From 02cac8f857fe21474ab25aa7150bed2ac5b970d5 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 26 Oct 2021 13:54:35 -0700
Subject: flake8 clean (with current settings)

---
 python/sandcrawler/fileset_platforms.py |  8 ++++----
 python/sandcrawler/ia.py                |  4 ++--
 python/sandcrawler/ingest_file.py       |  4 ++--
 python/sandcrawler/ingest_fileset.py    |  8 ++------
 python/sandcrawler/pdfextract.py        |  2 --
 python/sandcrawler/persist.py           |  4 ++--
 python/sandcrawler/workers.py           | 16 ++++++++++------
 7 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 2811100..c97e639 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -126,7 +126,7 @@ class DataverseHelper(FilesetPlatformHelper):
         # TODO: could also do HTML platform detection or something?
 
         components = urllib.parse.urlparse(url)
-        platform_domain = components.netloc.split(':')[0].lower()
+        # platform_domain = components.netloc.split(':')[0].lower()
         params = urllib.parse.parse_qs(components.query)
         id_param = params.get('persistentId')
         if not id_param:
@@ -134,7 +134,7 @@ class DataverseHelper(FilesetPlatformHelper):
         platform_id = id_param[0]
 
         try:
-            parsed = self.parse_dataverse_persistentid(platform_id)
+            self.parse_dataverse_persistentid(platform_id)
         except ValueError:
             return False
 
@@ -411,7 +411,7 @@ class FigshareHelper(FilesetPlatformHelper):
         resp.raise_for_status()
         obj = resp.json()
 
-        _figshare_type = obj['defined_type_name']
+        # figshare_type = obj['defined_type_name']
 
         if not obj['is_public']:
             raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
@@ -552,7 +552,7 @@ class ZenodoHelper(FilesetPlatformHelper):
             raise PlatformScopeError(
                 "got a work-level zenodo record, not a versioned record: {work_id}")
 
-        zenodo_type = obj['metadata']['resource_type']['type']
+        # zenodo_type = obj['metadata']['resource_type']['type']
 
         if obj['metadata']['access_right'] != 'open':
             raise PlatformRestrictedError(
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index fe739bb..9d990bf 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -18,7 +18,7 @@ import requests
 import urllib3.exceptions
 
 # not sure this will really work. Should go before wayback imports.
-http.client._MAXHEADERS = 1000  # noqa
+http.client._MAXHEADERS = 1000  # type: ignore
 
 import wayback.exception
 from gwb.loader import CDXLoaderFactory3
@@ -1153,7 +1153,7 @@ class SavePageNowClient:
                 url=cdx_row.url,
                 datetime=cdx_row.datetime,
             )
-        except (WaybackError, WaybackContentError) as we:
+        except (WaybackError, WaybackContentError):
             return ResourceResult(
                 start_url=start_url,
                 hit=False,
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 556e573..bc8643b 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -399,8 +399,8 @@ class IngestFileWorker(SandcrawlerWorker):
         assert resource.body
         try:
             html_doc = HTMLParser(resource.body)
-        except ValueError as ve:
-            return dict(status="html-selectolax-error", )
+        except ValueError:
+            return dict(status="html-selectolax-error")
         html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
         assert html_biblio
         html_body = html_extract_body_teixml(resource.body)
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 4376c89..ea34948 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -110,7 +110,7 @@ class IngestFilesetWorker(IngestFileWorker):
             result['status'] = 'wayback-content-error'
             result['error_message'] = str(e)[:1600]
             return result
-        except NotImplementedError as e:
+        except NotImplementedError:
             #result['status'] = 'not-implemented'
             #result['error_message'] = str(e)[:1600]
             #return result
@@ -269,10 +269,6 @@ class IngestFilesetWorker(IngestFileWorker):
             return result
 
         # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`.
-        terminal_url = base_url
-        if resource:
-            terminal_url = resource.terminal_url
-
         try:
             dataset_meta = platform_helper.process_request(request, resource, html_biblio)
         except PlatformScopeError as e:
@@ -363,7 +359,7 @@ class IngestFilesetWorker(IngestFileWorker):
         if ingest_strategy.endswith('-file'):
             result['fileset_file'] = dict()
             if archive_result.file_file_meta:
-                result['fileset_file']['file_meta'] = file_meta = archive_result.file_file_meta,
+                result['fileset_file']['file_meta'] = archive_result.file_file_meta,
             if archive_result.file_resource:
                 result['fileset_file']['terminal'] = dict(
                     terminal_url=archive_result.file_resource.terminal_url,
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9392136..222a408 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -434,8 +434,6 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
         )
 
     def process(self, record, key: Optional[str] = None):
-        default_key = record['sha1hex']
-
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 44c03f2..b714bc7 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -578,7 +578,7 @@ class PersistThumbnailWorker(SandcrawlerWorker):
         assert isinstance(blob, bytes)
         assert len(blob) >= 50
 
-        resp = self.s3.put_blob(
+        self.s3.put_blob(
             folder=self.s3_folder,
             blob=blob,
             sha1hex=key,
@@ -619,7 +619,7 @@ class GenericPersistDocWorker(SandcrawlerWorker):
         if 'sha1hex' in record:
             assert key_str == record['sha1hex']
 
-        resp = self.s3.put_blob(
+        self.s3.put_blob(
             folder=self.s3_folder,
             blob=record[self.doc_key].encode('utf-8'),
             sha1hex=key_str,
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 8c604fb..6b08f03 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -181,12 +181,16 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                 key=default_key,
                 source=record,
                 status="empty-blob",
+                wayback_sec=wayback_sec,
+                petabox_sec=petabox_sec,
             )
         return dict(
             key=default_key,
             status="success",
             source=record,
             blob=blob,
+            wayback_sec=wayback_sec,
+            petabox_sec=petabox_sec,
         )
@@ -219,9 +223,9 @@ class MultiprocessWrapper(SandcrawlerWorker):
             self.pool.terminate()
         if self.sink:
             self.sink.finish()
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("Multiprocessing: {}".format(self.counts), file=sys.stderr)
-        return worker_counts
+        return self.counts
 
 
 class BlackholeSink(SandcrawlerWorker):
@@ -370,7 +374,7 @@ class JsonLinePusher(RecordPusher):
             self.worker.push_batch(batch)
             self.counts['pushed'] += len(batch)
             batch = []
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("JSON lines pushed: {}".format(self.counts), file=sys.stderr)
         return self.counts
@@ -417,7 +421,7 @@ class CdxLinePusher(RecordPusher):
             self.worker.push_batch(batch)
             self.counts['pushed'] += len(batch)
             batch = []
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("CDX lines pushed: {}".format(self.counts), file=sys.stderr)
         return self.counts
@@ -456,7 +460,7 @@ class ZipfilePusher(RecordPusher):
             self.worker.push_batch(batch)
             self.counts['pushed'] += len(batch)
             batch = []
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr)
         return self.counts
@@ -552,7 +556,7 @@ class KafkaJsonPusher(RecordPusher):
 
         # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
        # commit the current batch if it has been lingering
-        worker_counts = self.worker.finish()
+        self.worker.finish()
         print("KafkaJson lines pushed: {}".format(self.counts), file=sys.stderr)
         self.consumer.close()
        return self.counts
--
cgit v1.2.3
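
Most of the hunks above clear the same flake8 check, F841 ("local variable
is assigned to but never used"): either an unused assignment is dropped
while the call is kept for its side effects, or an unused `as e:` / `as we:`
binding is removed from an except clause. A minimal before/after sketch of
that pattern, with hypothetical names (`validate_record`, `rec`) not taken
from sandcrawler:

    def validate_record_before(rec: dict) -> bool:
        try:
            parsed = int(rec["id"])  # flake8 F841: `parsed` is never read
        except ValueError as e:      # flake8 F841: `e` is never read
            return False
        return True

    def validate_record_after(rec: dict) -> bool:
        try:
            int(rec["id"])  # call kept only for its ValueError side effect
        except ValueError:
            return False
        return True

The commented-out assignments in the patch (e.g. `# zenodo_type = ...`) are
the same fix, keeping the field name visible as documentation instead of
deleting the line outright.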