about summary refs log tree commit diff stats
path: root/python/tests
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests')
-rw-r--r-- python/tests/test_grobid.py 6
-rw-r--r-- python/tests/test_html.py 5
-rw-r--r-- python/tests/test_html_ingest.py 4
-rw-r--r-- python/tests/test_ingest.py 10
-rw-r--r-- python/tests/test_live_wayback.py 15
-rw-r--r-- python/tests/test_misc.py 2
-rw-r--r-- python/tests/test_pdfextract.py 8
-rw-r--r-- python/tests/test_pushers.py 2
-rw-r--r-- python/tests/test_savepagenow.py 12
-rw-r--r-- python/tests/test_wayback.py 4
10 files changed, 26 insertions, 42 deletions
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 55636dc..15d43fb 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -2,9 +2,9 @@ import struct
import pytest
import responses
-from test_wayback import cdx_client, wayback_client
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
@@ -58,7 +58,7 @@ def test_grobid_success(grobid_client):
@responses.activate
-def test_grobid_worker_cdx(grobid_client, wayback_client):
+def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index c5f422e..1caca15 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,8 +1,3 @@
-import json
-
-import pytest
-import responses
-
from sandcrawler.html import extract_fulltext_url
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index 3bf94e2..727fef9 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,7 +1,3 @@
-import datetime
-
-import pytest
-
from sandcrawler.ingest_html import *
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 79f50f4..f2318c2 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -87,7 +87,7 @@ def test_ingest_success(ingest_worker_pdf):
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] == True
+ assert resp['hit'] is True
assert resp['status'] == "success"
assert resp['request'] == request
assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
@@ -156,7 +156,7 @@ def test_ingest_landing(ingest_worker):
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "no-pdf-link"
assert resp['request'] == request
assert 'terminal' in resp
@@ -179,7 +179,7 @@ def test_ingest_blocklist(ingest_worker):
resp = ingest_worker.process(request)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "skip-url-blocklist"
assert resp['request'] == request
@@ -197,7 +197,7 @@ def test_ingest_wall_blocklist(ingest_worker):
resp = ingest_worker.process(request)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "skip-wall"
assert resp['request'] == request
@@ -212,6 +212,6 @@ def test_ingest_cookie_blocklist(ingest_worker):
resp = ingest_worker.process(request)
- assert resp['hit'] == False
+ assert resp['hit'] is False
assert resp['status'] == "blocked-cookie"
assert resp['request'] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 0ff4902..bc74916 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -6,12 +6,9 @@ automatically in CI.
Simply uncomment lines to run.
"""
-import json
-
import pytest
-from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient,
- SavePageNowError, WaybackClient, WaybackError, gen_file_metadata)
+from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata
@pytest.fixture
@@ -89,7 +86,7 @@ def test_lookup_resource_success(wayback_client):
url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url in (url, url.replace("https://", "http://"))
assert resp.cdx.url in (url, url.replace("https://", "http://"))
@@ -139,7 +136,7 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
assert resp.terminal_status_code == 226
@@ -154,7 +151,7 @@ def test_lookup_ftp(wayback_client):
url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
resp = wayback_client.lookup_resource(url)
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.terminal_url == url
assert resp.terminal_status_code == 226
@@ -171,10 +168,10 @@ def test_crawl_ftp(spn_client, wayback_client):
resp = spn_client.crawl_resource(url, wayback_client)
# FTP isn't supported yet!
- #assert resp.hit == True
+ #assert resp.hit is True
#assert resp.status == "success"
#assert resp.terminal_url == url
#assert resp.cdx.url == url
- assert resp.hit == False
+ assert resp.hit is False
assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index dcc1202..7d3e755 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -83,7 +83,7 @@ def test_invalid_cdx():
print("missing warc")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
print("bad datetime")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 146b138..086243a 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -2,11 +2,9 @@ import struct
import poppler
import pytest
-import responses
-from test_wayback import cdx_client, wayback_client
+from test_wayback import cdx_client, wayback_client # noqa:F401
-from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker,
- WaybackClient)
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker
from sandcrawler.pdfextract import process_pdf
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
@@ -43,7 +41,7 @@ def test_process_dummy_pdf():
assert resp.pdf_extra['page_count'] == 1
-def test_pdfextract_worker_cdx(wayback_client):
+def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 63f90d3..353a560 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,5 +1,3 @@
-import pytest
-
from sandcrawler.workers import BlackholeSink, CdxLinePusher
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 80334d9..37f0bc9 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -120,7 +120,7 @@ def test_savepagenow_success(spn_client):
assert len(responses.calls) == 4
- assert resp.success == True
+ assert resp.success is True
assert resp.status == "success"
assert resp.request_url == TARGET
assert resp.terminal_url == TARGET + "/redirect"
@@ -151,12 +151,12 @@ def test_savepagenow_remote_error(spn_client):
assert len(responses.calls) == 3
- assert resp.success == False
+ assert resp.success is False
assert resp.status == ERROR_BODY['status_ext']
assert resp.request_url == TARGET
- assert resp.terminal_url == None
- assert resp.terminal_dt == None
- assert resp.resources == None
+ assert resp.terminal_url is None
+ assert resp.terminal_dt is None
+ assert resp.resources is None
@responses.activate
@@ -214,7 +214,7 @@ def test_crawl_resource(spn_client, wayback_client):
assert len(responses.calls) == 5
- assert resp.hit == True
+ assert resp.hit is True
assert resp.status == "success"
assert resp.body == WARC_BODY
assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6ccf775..9861db2 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -3,7 +3,7 @@ import json
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError
+from sandcrawler import CdxApiClient, WaybackClient
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
@@ -215,4 +215,4 @@ def test_lookup_resource_success(wayback_client):
resp = wayback_client.lookup_resource(CDX_TARGET)
- assert resp.hit == True
+ assert resp.hit is True