diff options
Diffstat (limited to 'python/tests')
-rw-r--r-- | python/tests/test_grobid.py | 6 | ||||
-rw-r--r-- | python/tests/test_html.py | 5 | ||||
-rw-r--r-- | python/tests/test_html_ingest.py | 4 | ||||
-rw-r--r-- | python/tests/test_ingest.py | 10 | ||||
-rw-r--r-- | python/tests/test_live_wayback.py | 15 | ||||
-rw-r--r-- | python/tests/test_misc.py | 2 | ||||
-rw-r--r-- | python/tests/test_pdfextract.py | 8 | ||||
-rw-r--r-- | python/tests/test_pushers.py | 2 | ||||
-rw-r--r-- | python/tests/test_savepagenow.py | 12 | ||||
-rw-r--r-- | python/tests/test_wayback.py | 4 |
10 files changed, 26 insertions, 42 deletions
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 55636dc..15d43fb 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -2,9 +2,9 @@ import struct import pytest import responses -from test_wayback import cdx_client, wayback_client +from test_wayback import cdx_client, wayback_client # noqa:F401 -from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient +from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -58,7 +58,7 @@ def test_grobid_success(grobid_client): @responses.activate -def test_grobid_worker_cdx(grobid_client, wayback_client): +def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811 sink = BlackholeSink() worker = GrobidWorker(grobid_client, wayback_client, sink=sink) diff --git a/python/tests/test_html.py b/python/tests/test_html.py index c5f422e..1caca15 100644 --- a/python/tests/test_html.py +++ b/python/tests/test_html.py @@ -1,8 +1,3 @@ -import json - -import pytest -import responses - from sandcrawler.html import extract_fulltext_url diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 3bf94e2..727fef9 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -1,7 +1,3 @@ -import datetime - -import pytest - from sandcrawler.ingest_html import * diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 79f50f4..f2318c2 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -87,7 +87,7 @@ def test_ingest_success(ingest_worker_pdf): resp = ingest_worker_pdf.process(request) print(resp) - assert resp['hit'] == True + assert resp['hit'] is True assert resp['status'] == "success" assert resp['request'] == request assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] @@ -156,7 +156,7 @@ def test_ingest_landing(ingest_worker): resp = ingest_worker.process(request) print(resp) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "no-pdf-link" assert resp['request'] == request assert 'terminal' in resp @@ -179,7 +179,7 @@ def test_ingest_blocklist(ingest_worker): resp = ingest_worker.process(request) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "skip-url-blocklist" assert resp['request'] == request @@ -197,7 +197,7 @@ def test_ingest_wall_blocklist(ingest_worker): resp = ingest_worker.process(request) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "skip-wall" assert resp['request'] == request @@ -212,6 +212,6 @@ def test_ingest_cookie_blocklist(ingest_worker): resp = ingest_worker.process(request) - assert resp['hit'] == False + assert resp['hit'] is False assert resp['status'] == "blocked-cookie" assert resp['request'] == request diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index 0ff4902..bc74916 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -6,12 +6,9 @@ automatically in CI. Simply uncomment lines to run. """ -import json - import pytest -from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, - SavePageNowError, WaybackClient, WaybackError, gen_file_metadata) +from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata @pytest.fixture @@ -89,7 +86,7 @@ def test_lookup_resource_success(wayback_client): url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url in (url, url.replace("https://", "http://")) assert resp.cdx.url in (url, url.replace("https://", "http://")) @@ -139,7 +136,7 @@ def test_lookup_ftp(wayback_client): url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url == url assert resp.terminal_status_code == 226 @@ -154,7 +151,7 @@ def test_lookup_ftp(wayback_client): url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url == url assert resp.terminal_status_code == 226 @@ -171,10 +168,10 @@ def test_crawl_ftp(spn_client, wayback_client): resp = spn_client.crawl_resource(url, wayback_client) # FTP isn't supported yet! - #assert resp.hit == True + #assert resp.hit is True #assert resp.status == "success" #assert resp.terminal_url == url #assert resp.cdx.url == url - assert resp.hit == False + assert resp.hit is False assert resp.status == "spn2-no-ftp" diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index dcc1202..7d3e755 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -83,7 +83,7 @@ def test_invalid_cdx(): print("missing warc") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -" - assert parse_cdx_line(raw) == None + assert parse_cdx_line(raw) is None print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 146b138..086243a 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -2,11 +2,9 @@ import struct import poppler import pytest -import responses -from test_wayback import cdx_client, wayback_client +from test_wayback import cdx_client, wayback_client # noqa:F401 -from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, - WaybackClient) +from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker from sandcrawler.pdfextract import process_pdf FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -43,7 +41,7 @@ def test_process_dummy_pdf(): assert resp.pdf_extra['page_count'] == 1 -def test_pdfextract_worker_cdx(wayback_client): +def test_pdfextract_worker_cdx(wayback_client): # noqa: F811 sink = BlackholeSink() worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py index 63f90d3..353a560 100644 --- a/python/tests/test_pushers.py +++ b/python/tests/test_pushers.py @@ -1,5 +1,3 @@ -import pytest - from sandcrawler.workers import BlackholeSink, CdxLinePusher diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index 80334d9..37f0bc9 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -120,7 +120,7 @@ def test_savepagenow_success(spn_client): assert len(responses.calls) == 4 - assert resp.success == True + assert resp.success is True assert resp.status == "success" assert resp.request_url == TARGET assert resp.terminal_url == TARGET + "/redirect" @@ -151,12 +151,12 @@ def test_savepagenow_remote_error(spn_client): assert len(responses.calls) == 3 - assert resp.success == False + assert resp.success is False assert resp.status == ERROR_BODY['status_ext'] assert resp.request_url == TARGET - assert resp.terminal_url == None - assert resp.terminal_dt == None - assert resp.resources == None + assert resp.terminal_url is None + assert resp.terminal_dt is None + assert resp.resources is None @responses.activate @@ -214,7 +214,7 @@ def test_crawl_resource(spn_client, wayback_client): assert len(responses.calls) == 5 - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.body == WARC_BODY assert resp.cdx.sha1b32 == CDX_BEST_SHA1B32 diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 6ccf775..9861db2 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -3,7 +3,7 @@ import json import pytest import responses -from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError +from sandcrawler import CdxApiClient, WaybackClient CDX_TARGET = "http://fatcat.wiki/" CDX_DT = "20180812220054" @@ -215,4 +215,4 @@ def test_lookup_resource_success(wayback_client): resp = wayback_client.lookup_resource(CDX_TARGET) - assert resp.hit == True + assert resp.hit is True |