author    Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
commit    05bd7cbcc62588e431c5efd533189e246b2a997e
tree      abcc707a451e77ea1e8c5ac9a5925b97a4bd139a  /python/tests
parent    f3f424e42f2f4f383103cf80b30a00cfa6cfc179
make fmt
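
The diff below is the output of the repository's code formatter. The resulting
style (call arguments broken one per line and aligned under the opening
parenthesis) is consistent with yapf's default continuation alignment; the
commit only records the `make fmt` invocation, so the exact formatter is an
inference. A representative before/after, taken from the test_grobid.py hunk
below:

    # hand-wrapped (before):
    responses.add(responses.POST,
        'http://dummy-grobid/api/processFulltextDocument', status=200,
        body=REAL_TEI_XML, content_type='text/xml')

    # formatter output (after): one argument per line, aligned with the
    # column just inside the opening parenthesis
    responses.add(responses.POST,
                  'http://dummy-grobid/api/processFulltextDocument',
                  status=200,
                  body=REAL_TEI_XML,
                  content_type='text/xml')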
Diffstat (limited to 'python/tests')
-rw-r--r--  python/tests/test_grobid.py          27
-rw-r--r--  python/tests/test_grobid2json.py      6
-rw-r--r--  python/tests/test_html.py            10
-rw-r--r--  python/tests/test_html_ingest.py      1
-rw-r--r--  python/tests/test_html_metadata.py   32
-rw-r--r--  python/tests/test_ingest.py         119
-rw-r--r--  python/tests/test_live_wayback.py    23
-rw-r--r--  python/tests/test_misc.py            22
-rw-r--r--  python/tests/test_pdfextract.py      12
-rw-r--r--  python/tests/test_pushers.py          7
-rw-r--r--  python/tests/test_savepagenow.py    126
-rw-r--r--  python/tests/test_wayback.py        100
-rw-r--r--  python/tests/test_xml.py              3
13 files changed, 294 insertions, 194 deletions
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 7d950df..55636dc 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,4 +1,3 @@
-
import struct
import pytest
@@ -12,20 +11,21 @@ FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
REAL_TEI_XML = f.read()
+
@pytest.fixture
def grobid_client():
- client = GrobidClient(
- host_url="http://dummy-grobid",
- )
+ client = GrobidClient(host_url="http://dummy-grobid", )
return client
+
@responses.activate
def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=503,
- body=status)
+ 'http://dummy-grobid/api/processFulltextDocument',
+ status=503,
+ body=status)
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
@@ -35,12 +35,15 @@ def test_grobid_503(grobid_client):
assert resp['status_code'] == 503
assert resp['status'] == "error"
+
@responses.activate
def test_grobid_success(grobid_client):
responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ 'http://dummy-grobid/api/processFulltextDocument',
+ status=200,
+ body=REAL_TEI_XML,
+ content_type='text/xml')
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
@@ -53,6 +56,7 @@ def test_grobid_success(grobid_client):
#print(type(REAL_TEI_XML))
assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1')
+
@responses.activate
def test_grobid_worker_cdx(grobid_client, wayback_client):
@@ -60,8 +64,10 @@ def test_grobid_worker_cdx(grobid_client, wayback_client):
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ 'http://dummy-grobid/api/processFulltextDocument',
+ status=200,
+ body=REAL_TEI_XML,
+ content_type='text/xml')
with open('tests/files/example.cdx', 'r') as cdx_file:
pusher = CdxLinePusher(
@@ -76,4 +82,3 @@ def test_grobid_worker_cdx(grobid_client, wayback_client):
assert pusher_counts['pushed'] == worker.counts['total']
assert len(responses.calls) == worker.counts['total']
-
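
Most of the reflowed calls above follow one pattern: the `responses` library
mocks HTTP endpoints so GrobidClient can be exercised offline. A minimal,
self-contained sketch of that pattern (the URL and payload here are
illustrative, not taken from sandcrawler):

    import requests
    import responses

    @responses.activate
    def test_mocked_post():
        # register a canned response for the mocked endpoint
        responses.add(responses.POST,
                      'http://dummy-service/api/endpoint',
                      status=503,
                      body=b'{"status": "overloaded"}')
        resp = requests.post('http://dummy-service/api/endpoint', data=b'junk')
        assert resp.status_code == 503
        # every intercepted request is recorded for later assertions
        assert len(responses.calls) == 1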
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index b8999b1..7637871 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,4 +1,3 @@
-
import json
import xml
@@ -8,14 +7,15 @@ from grobid2json import *
def test_small_xml():
-
+
with open('tests/files/small.xml', 'r') as f:
tei_xml = f.read()
with open('tests/files/small.json', 'r') as f:
- json_form = json.loads(f.read())
+ json_form = json.loads(f.read())
assert teixml2json(tei_xml) == json_form
+
def test_invalid_xml():
with pytest.raises(xml.etree.ElementTree.ParseError):
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index d4bffc1..c5f422e 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,4 +1,3 @@
-
import json
import pytest
@@ -13,8 +12,7 @@ def test_extract_fulltext_url():
assert resp == {}
resp = extract_fulltext_url(
- "http://dummy-site/",
- b"""<html>
+ "http://dummy-site/", b"""<html>
<head>
<meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
</head>
@@ -22,8 +20,7 @@ def test_extract_fulltext_url():
<h1>my big article here</h1>
blah
</body>
- </html>"""
- )
+ </html>""")
assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
assert resp['technique'] == "citation_pdf_url"
@@ -32,4 +29,5 @@ def test_extract_fulltext_url():
"https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
f.read(),
)
- assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert resp[
+ 'pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
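
The test above exercises the <meta name="citation_pdf_url"> extraction
technique. A simplified sketch of how that lookup could work with selectolax
(which these tests use as HTMLParser elsewhere); the real
extract_fulltext_url() tries many site-specific techniques beyond this one:

    from selectolax.parser import HTMLParser

    def find_citation_pdf_url(html: str) -> dict:
        # look for the Google Scholar-style <meta> tag in the document head
        tree = HTMLParser(html)
        meta = tree.css_first('meta[name="citation_pdf_url"]')
        if meta and meta.attributes.get('content'):
            return {
                'pdf_url': meta.attributes['content'],
                'technique': 'citation_pdf_url',
            }
        return {}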
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index 943e5da..3bf94e2 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,4 +1,3 @@
-
import datetime
import pytest
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index 7f35d55..a4c1e41 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -1,4 +1,3 @@
-
import datetime
import pytest
@@ -44,11 +43,12 @@ def test_html_metadata_plos() -> None:
def test_html_metadata_elife() -> None:
-
+
with open('tests/files/elife_article.html', 'r') as f:
elife_html = f.read()
- meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
+ meta = html_extract_biblio("https://elifesciences.org/articles/44753",
+ HTMLParser(elife_html))
assert meta is not None
assert meta.title == "Parallel visual circuitry in a basal chordate"
assert meta.doi == "10.7554/eLife.44753"
@@ -69,7 +69,7 @@ def test_html_metadata_elife() -> None:
def test_html_metadata_peerj() -> None:
-
+
with open('tests/files/peerj_oa_article.html', 'r') as f:
peerj_html = f.read()
@@ -78,15 +78,15 @@ def test_html_metadata_peerj() -> None:
assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
assert meta.doi == "10.7717/peerj.4375"
assert meta.contrib_names == [
- "Heather Piwowar",
- "Jason Priem",
- "Vincent Larivière",
- "Juan Pablo Alperin",
- "Lisa Matthias",
- "Bree Norlander",
- "Ashley Farley",
- "Jevin West",
- "Stefanie Haustein",
+ "Heather Piwowar",
+ "Jason Priem",
+ "Vincent Larivière",
+ "Juan Pablo Alperin",
+ "Lisa Matthias",
+ "Bree Norlander",
+ "Ashley Farley",
+ "Jevin West",
+ "Stefanie Haustein",
]
assert meta.container_name == "PeerJ"
# "2018-02-13"
@@ -129,7 +129,7 @@ def test_html_metadata_ojs3() -> None:
"Os Keyes",
]
assert meta.container_name == "First Monday"
- assert meta.container_abbrev == "1" # NOTE: bad source metadata
+ assert meta.container_abbrev == "1" # NOTE: bad source metadata
assert meta.container_issn == "1396-0466"
# "2020/09/10"
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
@@ -150,6 +150,7 @@ def test_html_metadata_dlib() -> None:
# "2017-05-15"
assert meta.release_date == datetime.date(year=2017, month=5, day=15)
+
def test_html_metadata_dc_case() -> None:
"""
This tests that CSS selector <meta name=""> attribute lookups are not case-sensitive.
@@ -167,10 +168,12 @@ def test_html_metadata_dc_case() -> None:
assert meta is not None
assert meta.issue == "123"
+
@pytest.fixture
def adblock() -> Any:
return load_adblock_rules()
+
def test_html_resources(adblock) -> None:
with open('tests/files/dlib_05vanhyning.html', 'r') as f:
@@ -227,4 +230,3 @@ def test_html_resources(adblock) -> None:
HTMLParser(nature_html),
adblock,
)
-
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 0965fcb..79f50f4 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,4 +1,3 @@
-
import json
import pytest
@@ -12,9 +11,7 @@ from sandcrawler import *
@pytest.fixture
def ingest_worker(wayback_client, spn_client):
- grobid_client = GrobidClient(
- host_url="http://dummy-grobid",
- )
+ grobid_client = GrobidClient(host_url="http://dummy-grobid", )
worker = IngestFileWorker(
wayback_client=wayback_client,
spn_client=spn_client,
@@ -22,14 +19,11 @@ def ingest_worker(wayback_client, spn_client):
)
return worker
+
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
- grobid_client = GrobidClient(
- host_url="http://dummy-grobid",
- )
- pgrest_client = SandcrawlerPostgrestClient(
- api_url="http://dummy-postgrest",
- )
+ grobid_client = GrobidClient(host_url="http://dummy-grobid", )
+ pgrest_client = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest", )
worker = IngestFileWorker(
wayback_client=wayback_client_pdf,
spn_client=spn_client,
@@ -50,37 +44,45 @@ def test_ingest_success(ingest_worker_pdf):
'base_url': "http://dummy-host/",
}
responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({
+ "url": TARGET,
+ "job_id": JOB_ID
+ }))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SPN_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=pdf_bytes)
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
+ TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=pdf_bytes)
responses.add(responses.GET,
- 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
- status=200,
- body=json.dumps([]))
+ 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"),
+ status=200,
+ body=json.dumps([]))
responses.add(responses.GET,
- 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format("90ffd2359008d82298821d16b21778c5c39aec36"),
- status=200,
- body=json.dumps([]))
+ 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"),
+ status=200,
+ body=json.dumps([]))
responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument', status=200,
- body=REAL_TEI_XML, content_type='text/xml')
+ 'http://dummy-grobid/api/processFulltextDocument',
+ status=200,
+ body=REAL_TEI_XML,
+ content_type='text/xml')
resp = ingest_worker_pdf.process(request)
@@ -108,6 +110,7 @@ def test_ingest_success(ingest_worker_pdf):
assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
assert resp['pdf_meta'].get('text') is None
+
@responses.activate
def test_ingest_landing(ingest_worker):
@@ -116,34 +119,39 @@ def test_ingest_landing(ingest_worker):
'base_url': "http://dummy-host/",
}
responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({
+ "url": TARGET,
+ "job_id": JOB_ID
+ }))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SPN_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
+ TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY)
# this is for second time around; don't want to fetch same landing page
# HTML again and result in a loop
responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body="<html></html>")
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
+ TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body="<html></html>")
resp = ingest_worker.process(request)
@@ -157,6 +165,7 @@ def test_ingest_landing(ingest_worker):
assert 'revisit_cdx' not in resp
assert 'grobid' not in resp
+
@responses.activate
def test_ingest_blocklist(ingest_worker):
@@ -192,6 +201,7 @@ def test_ingest_wall_blocklist(ingest_worker):
assert resp['status'] == "skip-wall"
assert resp['request'] == request
+
@responses.activate
def test_ingest_cookie_blocklist(ingest_worker):
@@ -205,4 +215,3 @@ def test_ingest_cookie_blocklist(ingest_worker):
assert resp['hit'] == False
assert resp['status'] == "blocked-cookie"
assert resp['request'] == request
-
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index b501dc3..0ff4902 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -1,4 +1,3 @@
-
"""
This file contains tests to run against "live" wayback services. They default
to "skip" because you need authentication, and we shouldn't hit these services
@@ -11,8 +10,8 @@ import json
import pytest
-from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, SavePageNowError,
- WaybackClient, WaybackError, gen_file_metadata)
+from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackError, gen_file_metadata)
@pytest.fixture
@@ -20,16 +19,19 @@ def cdx_client():
client = CdxApiClient()
return client
+
@pytest.fixture
def wayback_client():
client = WaybackClient()
return client
+
@pytest.fixture
def spn_client():
client = SavePageNowClient()
return client
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch(cdx_client):
@@ -50,6 +52,7 @@ def test_cdx_fetch(cdx_client):
with pytest.raises(KeyError):
resp = cdx_client.fetch(url, "12345678123456")
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_lookup_best(cdx_client):
@@ -68,13 +71,18 @@ def test_cdx_lookup_best(cdx_client):
assert resp.mimetype == "text/html"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_wayback_fetch(wayback_client):
- resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz")
+ resp = wayback_client.fetch_petabox(
+ 25683, 2676464871,
+ "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz"
+ )
assert resp.body
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_resource_success(wayback_client):
@@ -86,6 +94,7 @@ def test_lookup_resource_success(wayback_client):
assert resp.terminal_url in (url, url.replace("https://", "http://"))
assert resp.cdx.url in (url, url.replace("https://", "http://"))
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch_spn2(cdx_client):
@@ -107,8 +116,8 @@ def test_cdx_fetch_spn2(cdx_client):
# https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
-#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+ #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
datetime = "20200110222410"
@@ -119,6 +128,7 @@ def test_cdx_fetch_spn2(cdx_client):
assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
assert resp.status_code == 200
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_ftp(wayback_client):
# ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
@@ -153,6 +163,7 @@ def test_lookup_ftp(wayback_client):
file_meta = gen_file_metadata(resp.body)
assert file_meta['sha1hex'] == resp.cdx.sha1hex
+
@pytest.mark.skip(reason="hits prod services, requires auth")
def test_crawl_ftp(spn_client, wayback_client):
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 0788c38..dcc1202 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,11 +1,11 @@
-
import pytest
-from sandcrawler import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_line
+from sandcrawler import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path,
+ parse_cdx_line)
def test_gen_file_metadata():
-
+
# valid (but very small) PDF file
with open('tests/files/dummy.pdf', 'rb') as f:
file_meta = gen_file_metadata(f.read())
@@ -27,8 +27,9 @@ def test_gen_file_metadata():
assert fm['mimetype'] == 'text/plain'
assert fm['size_bytes'] == 8
+
def test_gen_file_metadata_path():
-
+
# valid (but very small) PDF file
file_meta = gen_file_metadata_path('tests/files/dummy.pdf')
assert file_meta == {
@@ -39,11 +40,14 @@ def test_gen_file_metadata_path():
'size_bytes': 13264,
}
+
def test_b32_hex():
# valid b32
- assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert b32_hex(
+ 'sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert b32_hex(
+ 'TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
# sha1hex pass-through
s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
@@ -53,6 +57,7 @@ def test_b32_hex():
with pytest.raises(ValueError):
assert b32_hex('blah') == 'blah'
+
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
@@ -73,6 +78,7 @@ def test_parse_cdx_line():
assert parse_cdx_line(raw + "\n") == correct
assert parse_cdx_line(raw + " extra_field") == correct
+
def test_invalid_cdx():
print("missing warc")
@@ -80,11 +86,11 @@ def test_invalid_cdx():
assert parse_cdx_line(raw) == None
print("bad datetime")
- raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
assert parse_cdx_line(raw) == None
+
def test_clean_url():
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
"https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
-
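
For reference, b32_hex() converts the base32-encoded SHA-1 digests used in
wayback CDX records into lowercase hex. A minimal sketch consistent with the
assertions above (the actual sandcrawler implementation may differ in details):

    import base64

    def b32_hex(s: str) -> str:
        # accept an optional "sha1:" prefix
        if s.startswith("sha1:"):
            s = s[5:]
        # 40 hex characters: already hex, pass through
        if len(s) == 40:
            return s.lower()
        # 32 base32 characters decode to the 20 raw SHA-1 bytes
        if len(s) != 32:
            raise ValueError(f"not a base32-encoded SHA-1: {s}")
        return base64.b32decode(s.upper()).hex()

    assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == \
        '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'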
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 1d334d6..146b138 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -1,4 +1,3 @@
-
import struct
import poppler
@@ -6,11 +5,13 @@ import pytest
import responses
from test_wayback import cdx_client, wayback_client
-from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, WaybackClient
+from sandcrawler import (BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker,
+ WaybackClient)
from sandcrawler.pdfextract import process_pdf
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+
def test_process_fake_pdf():
resp = process_pdf(FAKE_PDF_BYTES)
print(resp)
@@ -21,7 +22,9 @@ def test_process_fake_pdf():
resp = process_pdf(pdf_bytes)
assert resp.status == 'not-pdf'
-@pytest.mark.skipif(poppler.version_string() == '0.71.0', reason="unsupported version of poppler")
+
+@pytest.mark.skipif(poppler.version_string() == '0.71.0',
+ reason="unsupported version of poppler")
def test_process_dummy_pdf():
with open('tests/files/dummy.pdf', 'rb') as f:
pdf_bytes = f.read()
@@ -39,6 +42,7 @@ def test_process_dummy_pdf():
assert resp.pdf_extra['page0_width'] == 595
assert resp.pdf_extra['page_count'] == 1
+
def test_pdfextract_worker_cdx(wayback_client):
sink = BlackholeSink()
@@ -56,6 +60,7 @@ def test_pdfextract_worker_cdx(wayback_client):
assert pusher_counts['pushed'] == 7
assert pusher_counts['pushed'] == worker.counts['total']
+
def test_pdfextract_blob_worker():
sink = BlackholeSink()
@@ -65,4 +70,3 @@ def test_pdfextract_blob_worker():
pdf_bytes = f.read()
worker.process(pdf_bytes)
-
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 62fa515..63f90d3 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,4 +1,3 @@
-
import pytest
from sandcrawler.workers import BlackholeSink, CdxLinePusher
@@ -18,8 +17,10 @@ def test_cdx_line_pusher():
# HTTP 200 and application/pdf
with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(sink, cdx_file,
- filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
+ pusher = CdxLinePusher(sink,
+ cdx_file,
+ filter_mimetypes=['application/pdf'],
+ filter_http_statuses=[200, 226])
counts = pusher.run()
assert counts['total'] == 20
assert counts['skip-parse'] == 1
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index f3fbfda..80334d9 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,4 +1,3 @@
-
import json
import pytest
@@ -26,9 +25,7 @@ SUCCESS_BODY = {
"timestamp": "20180326070330",
"duration_sec": 6.203,
"resources": [
- TARGET,
- TARGET + "/redirect",
- "http://brewster.kahle.org/",
+ TARGET, TARGET + "/redirect", "http://brewster.kahle.org/",
"http://brewster.kahle.org/favicon.ico",
"http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg",
"http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg",
@@ -43,8 +40,7 @@ SUCCESS_BODY = {
"http://brewster.kahle.org/wp-content/themes/twentyten/style.css",
"http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4",
"http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
- "http://platform.twitter.com/widgets.js",
- "https://archive-it.org/piwik.js",
+ "http://platform.twitter.com/widgets.js", "https://archive-it.org/piwik.js",
"https://platform.twitter.com/jot.html",
"https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js",
"https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html",
@@ -60,7 +56,7 @@ SUCCESS_BODY = {
"https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
"https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
],
- "outlinks":{
+ "outlinks": {
"https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
"https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
}
@@ -74,10 +70,18 @@ ERROR_BODY = {
"resources": []
}
CDX_SPN_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"],
+ [
+ "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+ "robotflags", "length", "offset", "filename"
+ ],
+ [
+ "wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200",
+ CDX_BEST_SHA1B32, "-", "-", "8445", "108062304",
+ "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"
+ ],
]
+
@pytest.fixture
def spn_client():
client = SavePageNowClient(
@@ -88,25 +92,29 @@ def spn_client():
client.poll_seconds = 0.0
return client
+
@responses.activate
def test_savepagenow_success(spn_client):
responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({
+ "url": TARGET,
+ "job_id": JOB_ID
+ }))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
resp = spn_client.save_url_now_v2(TARGET)
@@ -119,21 +127,25 @@ def test_savepagenow_success(spn_client):
assert resp.terminal_dt == SUCCESS_BODY['timestamp']
assert resp.resources == SUCCESS_BODY['resources']
+
@responses.activate
def test_savepagenow_remote_error(spn_client):
responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({
+ "url": TARGET,
+ "job_id": JOB_ID
+ }))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(ERROR_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(ERROR_BODY))
resp = spn_client.save_url_now_v2(TARGET)
@@ -146,47 +158,56 @@ def test_savepagenow_remote_error(spn_client):
assert resp.terminal_dt == None
assert resp.resources == None
+
@responses.activate
def test_savepagenow_500(spn_client):
responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({
+ "url": TARGET,
+ "job_id": JOB_ID
+ }))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=500,
- body=json.dumps(ERROR_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=500,
+ body=json.dumps(ERROR_BODY))
with pytest.raises(SavePageNowError):
resp = spn_client.save_url_now_v2(TARGET)
assert len(responses.calls) == 2
+
@responses.activate
def test_crawl_resource(spn_client, wayback_client):
responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({"url": TARGET, "job_id": JOB_ID}))
+ 'http://dummy-spnv2/save',
+ status=200,
+ body=json.dumps({
+ "url": TARGET,
+ "job_id": JOB_ID
+ }))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY))
responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
+ 'http://dummy-spnv2/save/status/' + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY))
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SPN_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SPN_HIT))
responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
+ TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY)
print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
resp = spn_client.crawl_resource(TARGET, wayback_client)
@@ -201,4 +222,3 @@ def test_crawl_resource(spn_client, wayback_client):
assert type(resp.cdx) == CdxPartial
with pytest.raises(AttributeError):
print(resp.cdx.warc_path)
-
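
The fixtures above simulate the Save Page Now v2 request/poll cycle: a POST to
/save returns a job_id, and the client polls /save/status/<job_id> until the
job leaves the "pending" state. A rough sketch of that loop (endpoint paths
mirror the mocked URLs; the "status" field name is an assumption based on the
PENDING_BODY/SUCCESS_BODY/ERROR_BODY fixtures):

    import time
    import requests

    def save_url_now(api_base: str, url: str, poll_seconds: float = 3.0) -> dict:
        resp = requests.post(f"{api_base}/save", data={"url": url})
        resp.raise_for_status()
        job_id = resp.json()["job_id"]
        while True:
            status = requests.get(f"{api_base}/save/status/{job_id}")
            status.raise_for_status()
            body = status.json()
            if body.get("status") != "pending":
                # either a success body (timestamp, resources, ...) or an error
                return body
            time.sleep(poll_seconds)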
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 83311b9..6ccf775 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,4 +1,3 @@
-
import json
import pytest
@@ -10,27 +9,66 @@ CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+ "robotflags", "length", "offset", "filename"
+ ],
+ [
+ "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
]
CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner, but not right mimetype
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner and mimetype, but wrong status code
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # "best"
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # older
- ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+ "robotflags", "length", "offset", "filename"
+ ],
+ [
+ "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # sooner, but not right mimetype
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # sooner and mimetype, but wrong status code
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # "best"
+ [
+ "wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-",
+ "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # older
+ [
+ "wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
]
+
@pytest.fixture
def cdx_client():
client = CdxApiClient(
@@ -39,13 +77,14 @@ def cdx_client():
)
return client
+
@responses.activate
def test_cdx_fetch(cdx_client):
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
@@ -58,6 +97,7 @@ def test_cdx_fetch(cdx_client):
assert resp.warc_offset == 108062304
assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
@responses.activate
def test_cdx_fetch_errors(cdx_client):
@@ -65,9 +105,9 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, "2019")
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
with pytest.raises(KeyError):
resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -78,13 +118,14 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
assert len(responses.calls) == 3
+
@responses.activate
def test_cdx_lookup_best(cdx_client):
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_MULTI_HIT))
resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
@@ -95,6 +136,7 @@ def test_cdx_lookup_best(cdx_client):
assert resp.sha1b32 == CDX_BEST_SHA1B32
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
WARC_TARGET = "http://fatcat.wiki/"
WARC_BODY = b"""
<html>
@@ -108,6 +150,7 @@ WARC_BODY = b"""
</html>
"""
+
@pytest.fixture
def wayback_client(cdx_client, mocker):
client = WaybackClient(
@@ -127,6 +170,7 @@ def wayback_client(cdx_client, mocker):
return client
+
@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
@@ -150,6 +194,7 @@ def wayback_client_pdf(cdx_client, mocker):
return client
+
@responses.activate
def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
@@ -159,13 +204,14 @@ def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
assert resp == WARC_BODY
+
@responses.activate
def test_lookup_resource_success(wayback_client):
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_MULTI_HIT))
resp = wayback_client.lookup_resource(CDX_TARGET)
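
The CDX_MULTI_HIT fixture encodes the "best capture" selection that
lookup_best() is expected to perform: rows with the wrong mimetype or a
non-200 status are passed over, and among the remaining candidates the most
recent capture wins. A condensed sketch of that heuristic (the real
CdxApiClient.lookup_best() is more involved, e.g. around revisits and
redirects; this only mirrors what the fixture asserts):

    from typing import Optional

    def pick_best(rows: list, best_mimetype: str) -> Optional[list]:
        # rows[0] is the CDX header row; fields per row: urlkey, timestamp,
        # original, mimetype, statuscode, digest, redirect, robotflags,
        # length, offset, filename
        candidates = [
            r for r in rows[1:]
            if r[4] == "200" and r[3] == best_mimetype
        ]
        # timestamps are sortable YYYYMMDDHHMMSS strings; newest first
        candidates.sort(key=lambda r: r[1], reverse=True)
        return candidates[0] if candidates else None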
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
index a996c56..1742f3a 100644
--- a/python/tests/test_xml.py
+++ b/python/tests/test_xml.py
@@ -1,11 +1,10 @@
-
import pytest
from sandcrawler.xml import xml_reserialize
def test_xml_reserialize() -> None:
-
+
with open('tests/files/scielo_article.jats.xml', 'rb') as f:
raw_xml = f.read()