From 600ad67925a748200ddf21d5aeabd157d2bb3664 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 26 Oct 2021 13:35:36 -0700
Subject: start handling trivial lint cleanups: unused imports, 'is None', etc

---
 python/tests/test_grobid.py       |  6 +++---
 python/tests/test_html.py         |  5 -----
 python/tests/test_html_ingest.py  |  4 ----
 python/tests/test_ingest.py       | 10 +++++-----
 python/tests/test_live_wayback.py | 15 ++++++---------
 python/tests/test_misc.py         |  2 +-
 python/tests/test_pdfextract.py   |  8 +++-----
 python/tests/test_pushers.py      |  2 --
 python/tests/test_savepagenow.py  | 12 ++++++------
 python/tests/test_wayback.py      |  4 ++--
 10 files changed, 26 insertions(+), 42 deletions(-)

(limited to 'python/tests')

diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 55636dc..15d43fb 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -2,9 +2,9 @@ import struct
 
 import pytest
 import responses
-from test_wayback import cdx_client, wayback_client
+from test_wayback import cdx_client, wayback_client  # noqa:F401
 
-from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
 
 FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
 
@@ -58,7 +58,7 @@ def test_grobid_success(grobid_client):
 
 
 @responses.activate
-def test_grobid_worker_cdx(grobid_client, wayback_client):
+def test_grobid_worker_cdx(grobid_client, wayback_client):  # noqa: F811
 
     sink = BlackholeSink()
     worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index c5f422e..1caca15 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,8 +1,3 @@
-import json
-
-import pytest
-import responses
-
 from sandcrawler.html import extract_fulltext_url
 
 
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index 3bf94e2..727fef9 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,7 +1,3 @@
-import datetime
-
-import pytest
-
 from sandcrawler.ingest_html import *
 
 
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 79f50f4..f2318c2 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -87,7 +87,7 @@ def test_ingest_success(ingest_worker_pdf):
     resp = ingest_worker_pdf.process(request)
 
     print(resp)
-    assert resp['hit'] == True
+    assert resp['hit'] is True
     assert resp['status'] == "success"
     assert resp['request'] == request
     assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
@@ -156,7 +156,7 @@ def test_ingest_landing(ingest_worker):
     resp = ingest_worker.process(request)
 
     print(resp)
-    assert resp['hit'] == False
+    assert resp['hit'] is False
     assert resp['status'] == "no-pdf-link"
     assert resp['request'] == request
     assert 'terminal' in resp
@@ -179,7 +179,7 @@ def test_ingest_blocklist(ingest_worker):
 
     resp = ingest_worker.process(request)
 
-    assert resp['hit'] == False
+    assert resp['hit'] is False
     assert resp['status'] == "skip-url-blocklist"
     assert resp['request'] == request
 
@@ -197,7 +197,7 @@ def test_ingest_wall_blocklist(ingest_worker):
 
     resp = ingest_worker.process(request)
 
-    assert resp['hit'] == False
+    assert resp['hit'] is False
     assert resp['status'] == "skip-wall"
     assert resp['request'] == request
 
@@ -212,6 +212,6 @@ def test_ingest_cookie_blocklist(ingest_worker):
 
     resp = ingest_worker.process(request)
 
-    assert resp['hit'] == False
+    assert resp['hit'] is False
     assert resp['status'] == "blocked-cookie"
     assert resp['request'] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 0ff4902..bc74916 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -6,12 +6,9 @@ automatically in CI.
 Simply uncomment lines to run.
 """
 
-import json
-
 import pytest
 
-from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient,
-                         SavePageNowError, WaybackClient, WaybackError, gen_file_metadata)
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
 
 
 @pytest.fixture
@@ -89,7 +86,7 @@ def test_lookup_resource_success(wayback_client):
     url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
     resp = wayback_client.lookup_resource(url)
 
-    assert resp.hit == True
+    assert resp.hit is True
     assert resp.status == "success"
     assert resp.terminal_url in (url, url.replace("https://", "http://"))
     assert resp.cdx.url in (url, url.replace("https://", "http://"))
@@ -139,7 +136,7 @@ def test_lookup_ftp(wayback_client):
     url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
     resp = wayback_client.lookup_resource(url)
 
-    assert resp.hit == True
+    assert resp.hit is True
     assert resp.status == "success"
     assert resp.terminal_url == url
     assert resp.terminal_status_code == 226
@@ -154,7 +151,7 @@ def test_lookup_ftp(wayback_client):
     url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
     resp = wayback_client.lookup_resource(url)
 
-    assert resp.hit == True
+    assert resp.hit is True
     assert resp.status == "success"
     assert resp.terminal_url == url
     assert resp.terminal_status_code == 226
@@ -171,10 +168,10 @@ def test_crawl_ftp(spn_client, wayback_client):
     resp = spn_client.crawl_resource(url, wayback_client)
 
     # FTP isn't supported yet!
-    #assert resp.hit == True
+    #assert resp.hit is True
     #assert resp.status == "success"
     #assert resp.terminal_url == url
     #assert resp.cdx.url == url
 
-    assert resp.hit == False
+    assert resp.hit is False
     assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index dcc1202..7d3e755 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -83,7 +83,7 @@ def test_invalid_cdx():
 
     print("missing warc")
     raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
-    assert parse_cdx_line(raw) == None
+    assert parse_cdx_line(raw) is None
 
     print("bad datetime")
     raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 146b138..086243a 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -2,11 +2,9 @@ import struct
 
 import poppler
 import pytest
-import responses
-from test_wayback import cdx_client, wayback_client
+from test_wayback import cdx_client, wayback_client  # noqa:F401
 
-from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker,
-                         WaybackClient)
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
 from sandcrawler.pdfextract import process_pdf
 
 FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
@@ -43,7 +41,7 @@ def test_process_dummy_pdf():
     assert resp.pdf_extra['page_count'] == 1
 
 
-def test_pdfextract_worker_cdx(wayback_client):
+def test_pdfextract_worker_cdx(wayback_client):  # noqa: F811
 
     sink = BlackholeSink()
     worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 63f90d3..353a560 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,5 +1,3 @@
-import pytest
-
 from sandcrawler.workers import BlackholeSink, CdxLinePusher
 
 
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 80334d9..37f0bc9 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -120,7 +120,7 @@ def test_savepagenow_success(spn_client):
 
     assert len(responses.calls) == 4
 
-    assert resp.success == True
+    assert resp.success is True
     assert resp.status == "success"
     assert resp.request_url == TARGET
     assert resp.terminal_url == TARGET + "/redirect"
@@ -151,12 +151,12 @@ def test_savepagenow_remote_error(spn_client):
 
     assert len(responses.calls) == 3
 
-    assert resp.success == False
+    assert resp.success is False
     assert resp.status == ERROR_BODY['status_ext']
     assert resp.request_url == TARGET
-    assert resp.terminal_url == None
-    assert resp.terminal_dt == None
-    assert resp.resources == None
+    assert resp.terminal_url is None
+    assert resp.terminal_dt is None
+    assert resp.resources is None
 
 
 @responses.activate
@@ -214,7 +214,7 @@ def test_crawl_resource(spn_client, wayback_client):
 
     assert len(responses.calls) == 5
 
-    assert resp.hit == True
+    assert resp.hit is True
     assert resp.status == "success"
     assert resp.body == WARC_BODY
     assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6ccf775..9861db2 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -3,7 +3,7 @@ import json
 import pytest
 import responses
 
-from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError
+from sandcrawler import CdxApiClient, WaybackClient
 
 CDX_TARGET = "http://fatcat.wiki/"
 CDX_DT = "20180812220054"
@@ -215,4 +215,4 @@ def test_lookup_resource_success(wayback_client):
 
     resp = wayback_client.lookup_resource(CDX_TARGET)
 
-    assert resp.hit == True
+    assert resp.hit is True
-- 
cgit v1.2.3