author    Bryan Newbold <bnewbold@archive.org>  2021-10-26 13:35:36 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-26 13:35:36 -0700
commit    600ad67925a748200ddf21d5aeabd157d2bb3664 (patch)
tree      89ae6bc24e6eb3821c03efd7d781430345c68aa0 /python
parent    05bd7cbcc62588e431c5efd533189e246b2a997e (diff)
download  sandcrawler-600ad67925a748200ddf21d5aeabd157d2bb3664.tar.gz
          sandcrawler-600ad67925a748200ddf21d5aeabd157d2bb3664.zip
start handling trivial lint cleanups: unused imports, 'is None', etc
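The kinds of changes are easiest to see as a before/after pair. A minimal sketch of the patterns this commit targets, assuming a flake8-style linter (the function and variable names are hypothetical, not taken from the diff below):

# Before: patterns the linter flags
import datetime                          # F401: imported but never used

def check_resource_before(resource):
    if resource == None:                 # E711: should be `is None`
        return False
    try:
        resource.raise_for_status()
    except:                              # E722: bare except
        raise ValueError(f"bad resource")  # F541: f-string without placeholders
    return resource.hit == True          # E712: should be `is True`

# After: the equivalent cleaned-up version
def check_resource_after(resource):
    if resource is None:
        return False
    try:
        resource.raise_for_status()
    except Exception:
        raise ValueError("bad resource")
    return resource.hit is True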
Diffstat (limited to 'python')
-rwxr-xr-x  python/grobid_tool.py                      1
-rwxr-xr-x  python/ingest_tool.py                      4
-rwxr-xr-x  python/pdfextract_tool.py                  3
-rwxr-xr-x  python/pdftrio_tool.py                     2
-rw-r--r--  python/sandcrawler/fileset_platforms.py   20
-rw-r--r--  python/sandcrawler/fileset_strategies.py  16
-rw-r--r--  python/sandcrawler/fileset_types.py        2
-rw-r--r--  python/sandcrawler/grobid.py               4
-rw-r--r--  python/sandcrawler/html.py                 4
-rw-r--r--  python/sandcrawler/html_metadata.py        6
-rw-r--r--  python/sandcrawler/ia.py                  29
-rw-r--r--  python/sandcrawler/ingest_file.py         22
-rw-r--r--  python/sandcrawler/ingest_fileset.py      29
-rw-r--r--  python/sandcrawler/ingest_html.py          9
-rw-r--r--  python/sandcrawler/minio.py                1
-rw-r--r--  python/sandcrawler/misc.py                 8
-rw-r--r--  python/sandcrawler/pdfextract.py           2
-rw-r--r--  python/sandcrawler/pdftrio.py              1
-rw-r--r--  python/sandcrawler/workers.py              1
-rwxr-xr-x  python/sandcrawler_worker.py               3
-rw-r--r--  python/tests/test_grobid.py                6
-rw-r--r--  python/tests/test_html.py                  5
-rw-r--r--  python/tests/test_html_ingest.py           4
-rw-r--r--  python/tests/test_ingest.py               10
-rw-r--r--  python/tests/test_live_wayback.py         15
-rw-r--r--  python/tests/test_misc.py                  2
-rw-r--r--  python/tests/test_pdfextract.py            8
-rw-r--r--  python/tests/test_pushers.py               2
-rw-r--r--  python/tests/test_savepagenow.py          12
-rw-r--r--  python/tests/test_wayback.py               4
30 files changed, 86 insertions, 149 deletions
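The `== None` / `== True` rewrites in the diff below are not purely stylistic: equality can be overridden by a class, identity cannot. A self-contained illustration of why linters insist on `is None` (the class here is hypothetical, not part of sandcrawler):

class ResultStub:
    """Stand-in for a result object with a permissive __eq__."""
    def __eq__(self, other):
        return True  # claims equality with anything, including None

row = ResultStub()
print(row == None)   # True  -- __eq__ decides, and it can lie
print(row is None)   # False -- identity check is unambiguous

The remaining judgment call visible below: the test modules import pytest fixtures (`cdx_client`, `wayback_client`) from test_wayback.py, which look unused to the linter, so those imports are kept and annotated with `# noqa: F401` / `# noqa: F811` rather than removed.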
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 4ba9540..c36fe0a 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -9,7 +9,6 @@ Example of large parallel run, locally:
"""
import argparse
-import datetime
import json
import sys
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index 305c3a8..eb1047d 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -42,8 +42,8 @@ def run_requests(args):
html_quick_mode=args.html_quick_mode,
)
fileset_worker = IngestFilesetWorker(try_spn2=not args.no_spn2, )
- for l in args.json_file:
- request = json.loads(l.strip())
+ for line in args.json_file:
+ request = json.loads(line.strip())
if request['ingest_type'] in [
'dataset',
]:
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 717b743..f3df6e3 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -4,11 +4,8 @@ KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
"""
import argparse
-import datetime
-import json
import sys
-from grobid2json import teixml2json
from sandcrawler import *
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 9316313..dbe5b10 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -9,8 +9,6 @@ cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftri
"""
import argparse
-import datetime
-import json
import sys
from sandcrawler import *
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index f3441c9..2811100 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,10 +1,5 @@
-import gzip
-import json
-import sys
-import time
import urllib.parse
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Optional, Tuple
import internetarchive
import requests
@@ -175,12 +170,12 @@ class DataverseHelper(FilesetPlatformHelper):
try:
parsed_id = self.parse_dataverse_persistentid(platform_id)
except ValueError:
- raise PlatformScopeError(f"not actually in scope")
+ raise PlatformScopeError("not actually in scope")
if parsed_id['file_id']:
# XXX: maybe we could support this?
raise PlatformScopeError(
- f"only entire dataverse datasets can be archived with this tool")
+ "only entire dataverse datasets can be archived with this tool")
# 1b. if we didn't get a version number from URL, fetch it from API
if not dataset_version:
@@ -277,13 +272,6 @@ def test_parse_dataverse_persistentid():
"dataset_id": "LL6WXZ",
"file_id": None,
},
- "doi:10.25625/LL6WXZ": {
- "type": "doi",
- "authority": "10.25625",
- "shoulder": None,
- "dataset_id": "LL6WXZ",
- "file_id": None,
- },
"doi:10.5072/FK2/J8SJZB": {
"type": "doi",
"authority": "10.5072",
@@ -423,7 +411,7 @@ class FigshareHelper(FilesetPlatformHelper):
resp.raise_for_status()
obj = resp.json()
- figshare_type = obj['defined_type_name']
+ _figshare_type = obj['defined_type_name']
if not obj['is_public']:
raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}')
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index 6c25276..4e44d97 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,19 +1,13 @@
-import gzip
-import json
import os
import shutil
import sys
-import time
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Optional
import internetarchive
-from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile,
- FilesetPlatformItem, IngestStrategy, PlatformScopeError)
-from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import (ResourceResult, SavePageNowClient, WaybackClient,
- fix_transfer_encoding)
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem,
+ IngestStrategy, PlatformScopeError)
+from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
@@ -233,7 +227,7 @@ class WebFilesetStrategy(FilesetIngestStrategy):
via = "wayback"
resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype)
- if self.try_spn2 and (resource == None or
+ if self.try_spn2 and (resource is None or
(resource and resource.status == 'no-capture')):
if len(item.manifest) > self.max_spn_manifest:
m.status = 'too-much-spn'
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index 606af07..f543ede 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,5 +1,5 @@
from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
from pydantic import BaseModel
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 16bbb01..d0b7f7e 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -23,7 +23,7 @@ class GrobidClient(object):
"""
assert blob
- if consolidate_mode == None:
+ if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
try:
@@ -100,8 +100,6 @@ class GrobidWorker(SandcrawlerFetchWorker):
)
def process(self, record, key=None):
- default_key = record['sha1hex']
-
fetch_result = self.fetch_blob(record)
if fetch_result['status'] != 'success':
return fetch_result
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index a44fc67..5b9742a 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -53,12 +53,12 @@ def extract_fulltext_url(html_url, html_body):
print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
elif url.startswith('/'):
if host_prefix + url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
+ print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
else:
return dict(pdf_url=host_prefix + url, technique='citation_pdf_url')
elif url.startswith('http'):
if url == html_url:
- print(f"\tavoiding citation_pdf_url link-loop", file=sys.stderr)
+ print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
else:
return dict(pdf_url=url, technique='citation_pdf_url')
else:
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 6d27a3a..15a9f2b 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,7 +1,7 @@
import datetime
import sys
import urllib.parse
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple
import braveblock
import dateparser
@@ -687,7 +687,7 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser,
continue
return (val, pattern.get('technique', 'unknown'))
if self_doc_url:
- print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
return self_doc_url
return None
@@ -864,7 +864,7 @@ def html_extract_resources(doc_url: str, doc: HTMLParser,
# filter using adblocker
resources = [
r for r in resources if adblock.check_network_urls(
- r['url'], source_url=doc_url, request_type=r['type']) == False
+ r['url'], source_url=doc_url, request_type=r['type']) is False
]
# remove duplicates
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a8ce193..fe739bb 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -11,15 +11,14 @@ import sys
import time
import urllib.parse
from collections import namedtuple
+from http.client import IncompleteRead
from typing import Tuple
import requests
import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
-http.client._MAXHEADERS = 1000 # type: ignore
-
-from http.client import IncompleteRead
+http.client._MAXHEADERS = 1000 # noqa
import wayback.exception
from gwb.loader import CDXLoaderFactory3
@@ -128,18 +127,18 @@ def fuzzy_match_url(left, right):
def test_fuzzy_match_url():
- assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True
- assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True
- assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
+ assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
- assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False
+ assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") is False
+ assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") is False
class CdxApiError(Exception):
@@ -951,7 +950,7 @@ class SavePageNowClient:
resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, job_id))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
status = resp.json()['status']
if status == 'pending':
@@ -975,7 +974,7 @@ class SavePageNowClient:
final_json['original_job_id']))
try:
resp.raise_for_status()
- except:
+ except Exception:
raise SavePageNowError(resp.content)
final_json = resp.json()
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index b480cc2..556e573 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,23 +1,19 @@
-import base64
-import gzip
import json
import sys
import time
import xml.etree.ElementTree
-from collections import namedtuple
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from typing import Any, Dict, List, Optional, Tuple
+from http.server import BaseHTTPRequestHandler
+from typing import Any, Dict, List, Optional
-import requests
from selectolax.parser import HTMLParser
from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
- html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError,
- ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient,
+from sandcrawler.html_metadata import (html_extract_biblio, html_extract_resources,
+ load_adblock_rules)
+from sandcrawler.ia import (CdxApiError, NoCaptureError, PetaboxError, ResourceResult,
+ SavePageNowClient, SavePageNowError, WaybackClient,
WaybackContentError, WaybackError, cdx_to_dict,
fix_transfer_encoding)
from sandcrawler.ingest_html import (WebResource, fetch_html_resources,
@@ -211,7 +207,7 @@ class IngestFileWorker(SandcrawlerWorker):
return None
existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url)
# TODO: filter on more flags?
- if existing and existing['hit'] == True:
+ if existing and existing['hit'] is True:
return existing
else:
return None
@@ -249,7 +245,7 @@ class IngestFileWorker(SandcrawlerWorker):
if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000':
old_failure = True
- if self.try_spn2 and (resource == None or (resource and resource.status == 'no-capture')
+ if self.try_spn2 and (resource is None or (resource and resource.status == 'no-capture')
or soft404 or old_failure):
via = "spn2"
resource = self.spn_client.crawl_resource(url, self.wayback_client)
@@ -751,7 +747,7 @@ class IngestFileWorker(SandcrawlerWorker):
# fetch must be a hit if we got this far (though not necessarily an ingest hit!)
assert resource
- assert resource.hit == True
+ assert resource.hit is True
assert resource.terminal_status_code in (200, 226)
if resource.terminal_url:
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 5cbb908..4376c89 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,30 +1,19 @@
-import gzip
import json
import sys
import time
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.db import SandcrawlerPostgrestClient
-from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
-from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE
from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
- html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError,
- ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient,
- WaybackContentError, WaybackError, cdx_to_dict,
- fix_transfer_encoding)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError,
+ WaybackError, cdx_to_dict, fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.ingest_html import (WebResource, fetch_html_resources,
- html_extract_body_teixml, html_guess_platform,
- html_guess_scope, quick_fetch_html_resources)
-from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
-from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
@@ -61,7 +50,7 @@ class IngestFilesetWorker(IngestFileWorker):
return None
existing = self.pgrest_client.get_ingest_fileset_result(ingest_type, base_url)
# TODO: filter on more flags?
- if existing and existing['hit'] == True:
+ if existing and existing['hit'] is True:
return existing
else:
return None
@@ -196,7 +185,7 @@ class IngestFilesetWorker(IngestFileWorker):
# fetch must be a hit if we got this far (though not necessarily an ingest hit!)
assert resource
- assert resource.hit == True
+ assert resource.hit is True
assert resource.terminal_status_code in (200, 226)
if resource.terminal_url:
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index bf25d5d..91e5c6e 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,6 +1,5 @@
import argparse
import datetime
-import io
import json
import sys
import xml.etree.ElementTree as ET
@@ -12,9 +11,9 @@ from selectolax.parser import HTMLParser
from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient,
- WaybackContentError, cdx_to_dict, fix_transfer_encoding)
-from sandcrawler.misc import (clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, WaybackClient, WaybackContentError,
+ cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import (datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
url_fuzzy_equal)
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
@@ -147,7 +146,7 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
file=sys.stderr)
if not cdx_row.status_code:
# TODO: fall back to a full fetch?
- print(f" WARN: skipping revisit record", file=sys.stderr)
+ print(" WARN: skipping revisit record", file=sys.stderr)
continue
full.append(
WebResource(
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 188621f..046db9e 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,6 +1,5 @@
import hashlib
import io
-import os
import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index ddbd95a..5ca7a4b 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -35,10 +35,10 @@ def url_fuzzy_equal(left: str, right: str) -> bool:
def test_url_fuzzy_equal() -> None:
- assert True == url_fuzzy_equal(
+ assert url_fuzzy_equal(
"http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree",
"http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree"
- )
+ ) is True
def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict:
@@ -239,8 +239,8 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
def test_parse_cdx_datetime() -> None:
- assert parse_cdx_datetime("") == None
- assert parse_cdx_datetime("asdf") == None
+ assert parse_cdx_datetime("") is None
+ assert parse_cdx_datetime("asdf") is None
assert parse_cdx_datetime("19930203123045") != None
assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
month=10,
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 190672d..9392136 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -293,7 +293,7 @@ def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExt
return PdfExtractResult(
sha1hex=sha1hex,
status='bad-pdf',
- error_msg=f"PDF known to cause processing issues",
+ error_msg="PDF known to cause processing issues",
file_meta=file_meta,
)
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index e3d4a54..ba875cd 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -75,7 +75,6 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
def process(self, record, key=None):
start_process = time.time()
- default_key = record['sha1hex']
fetch_sec = None
start = time.time()
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 7135f4c..8c604fb 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -116,7 +116,6 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
self.wayback_client = wayback_client
def fetch_blob(self, record):
- start_process = time.time()
default_key = record['sha1hex']
wayback_sec = None
petabox_sec = None
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 3c76c17..3e35807 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -6,7 +6,6 @@ or S3 (SeaweedFS).
"""
import argparse
-import datetime
import os
import sys
@@ -18,7 +17,7 @@ from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
try:
git_sha = raven.fetch_git_sha('..')
-except Exception as e:
+except Exception:
git_sha = None
sentry_client = raven.Client(release=git_sha)
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 55636dc..15d43fb 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -2,9 +2,9 @@ import struct
import pytest
import responses
-from test_wayback import cdx_client, wayback_client
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
@@ -58,7 +58,7 @@ def test_grobid_success(grobid_client):
@responses.activate
-def test_grobid_worker_cdx(grobid_client, wayback_client):
+def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index c5f422e..1caca15 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,8 +1,3 @@
-import json
-
-import pytest
-import responses
-
from sandcrawler.html import extract_fulltext_url
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index 3bf94e2..727fef9 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,7 +1,3 @@
-import datetime
-
-import pytest
-
from sandcrawler.ingest_html import *
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 79f50f4..f2318c2 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -87,7 +87,7 @@ def test_ingest_success(ingest_worker_pdf):
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] == True
+ assert resp['hit'] is True
assert resp['status'] == "success"
assert resp['request'] == request
assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
@@ -156,7 +156,7 @@ def test_ingest_landing(ingest_worker):
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "no-pdf-link"
assert resp['request'] == request
assert 'terminal' in resp
@@ -179,7 +179,7 @@ def test_ingest_blocklist(ingest_worker):
resp = ingest_worker.process(request)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "skip-url-blocklist"
assert resp['request'] == request
@@ -197,7 +197,7 @@ def test_ingest_wall_blocklist(ingest_worker):
resp = ingest_worker.process(request)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "skip-wall"
assert resp['request'] == request
@@ -212,6 +212,6 @@ def test_ingest_cookie_blocklist(ingest_worker):
resp = ingest_worker.process(request)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "blocked-cookie"
assert resp['request'] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 0ff4902..bc74916 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -6,12 +6,9 @@ automatically in CI.
Simply uncomment lines to run.
"""
-import json
-
import pytest
-from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient,
- SavePageNowError, WaybackClient, WaybackError, gen_file_metadata)
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
@pytest.fixture
@@ -89,7 +86,7 @@ def test_lookup_resource_success(wayback_client):
url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url in (url, url.replace("https://", "http://"))
assert resp.cdx.url in (url, url.replace("https://", "http://"))
@@ -139,7 +136,7 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
assert resp.terminal_status_code == 226
@@ -154,7 +151,7 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
assert resp.terminal_status_code == 226
@@ -171,10 +168,10 @@ def test_crawl_ftp(spn_client, wayback_client):
resp = spn_client.crawl_resource(url, wayback_client)
# FTP isn't supported yet!
- #assert resp.hit == True
+ #assert resp.hit is True
#assert resp.status == "success"
#assert resp.terminal_url == url
#assert resp.cdx.url == url
- assert resp.hit == False
+ assert resp.hit is False
assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index dcc1202..7d3e755 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -83,7 +83,7 @@ def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 146b138..086243a 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -2,11 +2,9 @@ import struct
import poppler
import pytest
-import responses
-from test_wayback import cdx_client, wayback_client
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker,
- WaybackClient)
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
from sandcrawler.pdfextract import process_pdf
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
@@ -43,7 +41,7 @@ def test_process_dummy_pdf():
assert resp.pdf_extra['page_count'] == 1
-def test_pdfextract_worker_cdx(wayback_client):
+def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 63f90d3..353a560 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,5 +1,3 @@
-import pytest
-
from sandcrawler.workers import BlackholeSink, CdxLinePusher
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 80334d9..37f0bc9 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -120,7 +120,7 @@ def test_savepagenow_success(spn_client):
assert len(responses.calls) == 4
- assert resp.success == True
+ assert resp.success is True
assert resp.status == "success"
assert resp.request_url == TARGET
assert resp.terminal_url == TARGET + "/redirect"
@@ -151,12 +151,12 @@ def test_savepagenow_remote_error(spn_client):
assert len(responses.calls) == 3
- assert resp.success == False
+ assert resp.success is False
assert resp.status == ERROR_BODY['status_ext']
assert resp.request_url == TARGET
- assert resp.terminal_url == None
- assert resp.terminal_dt == None
- assert resp.resources == None
+ assert resp.terminal_url is None
+ assert resp.terminal_dt is None
+ assert resp.resources is None
@responses.activate
@@ -214,7 +214,7 @@ def test_crawl_resource(spn_client, wayback_client):
assert len(responses.calls) == 5
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.body == WARC_BODY
assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6ccf775..9861db2 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -3,7 +3,7 @@ import json
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError
+from sandcrawler import CdxApiClient, WaybackClient
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
@@ -215,4 +215,4 @@ def test_lookup_resource_success(wayback_client):
resp = wayback_client.lookup_resource(CDX_TARGET)
- assert resp.hit == True
+ assert resp.hit is True