Diffstat (limited to 'python/tests')
-rw-r--r--  python/tests/test_grobid.py           |  66
-rw-r--r--  python/tests/test_grobid2json.py      |   4
-rw-r--r--  python/tests/test_html.py             |  18
-rw-r--r--  python/tests/test_html_ingest.py      |   4
-rw-r--r--  python/tests/test_html_metadata.py    |  77
-rw-r--r--  python/tests/test_ingest.py           | 273
-rw-r--r--  python/tests/test_live_wayback.py     |  28
-rw-r--r--  python/tests/test_misc.py             |  86
-rw-r--r--  python/tests/test_pdfextract.py       |  35
-rw-r--r--  python/tests/test_pushers.py          |  30
-rw-r--r--  python/tests/test_savepagenow.py      | 206
-rw-r--r--  python/tests/test_wayback.py          | 168
-rw-r--r--  python/tests/test_xml.py              |   2
13 files changed, 595 insertions(+), 402 deletions(-)
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 15d43fb..c086d73 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -8,13 +8,15 @@ from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
-with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f:
+with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f:
REAL_TEI_XML = f.read()
@pytest.fixture
def grobid_client():
- client = GrobidClient(host_url="http://dummy-grobid", )
+ client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
return client
@@ -22,39 +24,43 @@ def grobid_client():
def test_grobid_503(grobid_client):
status = b'{"status": "done broke due to 503"}'
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument',
- status=503,
- body=status)
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=503,
+ body=status,
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POSTed exactly once
assert len(responses.calls) == 1
- assert resp['status_code'] == 503
- assert resp['status'] == "error"
+ assert resp["status_code"] == 503
+ assert resp["status"] == "error"
@responses.activate
def test_grobid_success(grobid_client):
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument',
- status=200,
- body=REAL_TEI_XML,
- content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
resp = grobid_client.process_fulltext(FAKE_PDF_BYTES)
# grobid gets POSTed exactly once
assert len(responses.calls) == 1
- assert resp['status_code'] == 200
- assert resp['status'] == "success"
- #print(type(resp['tei_xml']))
- #print(type(REAL_TEI_XML))
- assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1')
+ assert resp["status_code"] == 200
+ assert resp["status"] == "success"
+ # print(type(resp['tei_xml']))
+ # print(type(REAL_TEI_XML))
+ assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1")
@responses.activate
@@ -63,22 +69,24 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811
sink = BlackholeSink()
worker = GrobidWorker(grobid_client, wayback_client, sink=sink)
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument',
- status=200,
- body=REAL_TEI_XML,
- content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
- assert len(responses.calls) == worker.counts['total']
+ assert len(responses.calls) == worker.counts["total"]
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 7637871..98888e8 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -8,9 +8,9 @@ from grobid2json import *
def test_small_xml():
- with open('tests/files/small.xml', 'r') as f:
+ with open("tests/files/small.xml", "r") as f:
tei_xml = f.read()
- with open('tests/files/small.json', 'r') as f:
+ with open("tests/files/small.json", "r") as f:
json_form = json.loads(f.read())
assert teixml2json(tei_xml) == json_form
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 1caca15..614b802 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -7,7 +7,8 @@ def test_extract_fulltext_url():
assert resp == {}
resp = extract_fulltext_url(
- "http://dummy-site/", b"""<html>
+ "http://dummy-site/",
+ b"""<html>
<head>
<meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
</head>
@@ -15,14 +16,17 @@ def test_extract_fulltext_url():
<h1>my big article here</h1>
blah
</body>
- </html>""")
- assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
- assert resp['technique'] == "citation_pdf_url"
+ </html>""",
+ )
+ assert resp["pdf_url"] == "http://www.example.com/content/271/20/11761.full.pdf"
+ assert resp["technique"] == "citation_pdf_url"
- with open('tests/files/plos_one_article.html', 'rb') as f:
+ with open("tests/files/plos_one_article.html", "rb") as f:
resp = extract_fulltext_url(
"https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
f.read(),
)
- assert resp[
- 'pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ resp["pdf_url"]
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index 727fef9..ba4acf1 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -3,8 +3,8 @@ from sandcrawler.ingest_html import *
def test_html_extract_ojs3() -> None:
- with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f:
+ with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f:
ojs3_html = f.read()
fulltext = html_extract_body_teixml(ojs3_html)
- assert fulltext['status'] == 'success'
+ assert fulltext["status"] == "success"
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index a4c1e41..69bd211 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -7,14 +7,20 @@ from sandcrawler.html_metadata import *
def test_html_metadata_plos() -> None:
- with open('tests/files/plos_one_article.html', 'r') as f:
+ with open("tests/files/plos_one_article.html", "r") as f:
plos_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
assert meta is not None
- assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ assert (
+ meta.title
+ == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
+ )
assert meta.doi == "10.1371/journal.pone.0213978"
- assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
assert meta.contrib_names == [
"Yang Li",
"Tuanjie Wang",
@@ -37,18 +43,26 @@ def test_html_metadata_plos() -> None:
assert meta.volume == "14"
assert meta.container_issn == "1932-6203"
assert meta.publisher == "Public Library of Science"
- assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
+ assert (
+ meta.raw_references
+ and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;"
+ in meta.raw_references
+ )
assert meta.release_type == "article-journal"
- assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+ )
def test_html_metadata_elife() -> None:
- with open('tests/files/elife_article.html', 'r') as f:
+ with open("tests/files/elife_article.html", "r") as f:
elife_html = f.read()
- meta = html_extract_biblio("https://elifesciences.org/articles/44753",
- HTMLParser(elife_html))
+ meta = html_extract_biblio(
+ "https://elifesciences.org/articles/44753", HTMLParser(elife_html)
+ )
assert meta is not None
assert meta.title == "Parallel visual circuitry in a basal chordate"
assert meta.doi == "10.7554/eLife.44753"
@@ -65,17 +79,23 @@ def test_html_metadata_elife() -> None:
# 2019-04-18
assert meta.release_date == datetime.date(year=2019, month=4, day=18)
assert meta.publisher == "eLife Sciences Publications Limited"
- assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ assert (
+ meta.pdf_fulltext_url
+ == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+ )
def test_html_metadata_peerj() -> None:
- with open('tests/files/peerj_oa_article.html', 'r') as f:
+ with open("tests/files/peerj_oa_article.html", "r") as f:
peerj_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
assert meta is not None
- assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ assert (
+ meta.title
+ == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+ )
assert meta.doi == "10.7717/peerj.4375"
assert meta.contrib_names == [
"Heather Piwowar",
@@ -96,7 +116,7 @@ def test_html_metadata_peerj() -> None:
def test_html_metadata_nature() -> None:
- with open('tests/files/nature_article.html', 'r') as f:
+ with open("tests/files/nature_article.html", "r") as f:
nature_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
@@ -111,12 +131,15 @@ def test_html_metadata_nature() -> None:
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
assert meta.publisher == "Nature Publishing Group"
# note: an error in the Dublin Core metadata in the Nature HTML results in duplication
- assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ assert (
+ meta.abstract
+ == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."
+ )
def test_html_metadata_ojs3() -> None:
- with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
ojs3_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
@@ -134,14 +157,20 @@ def test_html_metadata_ojs3() -> None:
# "2020/09/10"
assert meta.release_date == datetime.date(year=2020, month=9, day=10)
assert meta.lang == "en"
- assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
- assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ assert (
+ meta.abstract
+ == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
+ )
+ assert (
+ meta.html_fulltext_url
+ == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
+ )
assert meta.release_type == "article-journal"
def test_html_metadata_dlib() -> None:
- with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
dlib_html = f.read()
meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
@@ -176,7 +205,7 @@ def adblock() -> Any:
def test_html_resources(adblock) -> None:
- with open('tests/files/dlib_05vanhyning.html', 'r') as f:
+ with open("tests/files/dlib_05vanhyning.html", "r") as f:
dlib_html = f.read()
resources = html_extract_resources(
@@ -189,9 +218,9 @@ def test_html_resources(adblock) -> None:
# check that adblock is working
for r in resources:
- assert '/ga.js' not in r['url']
+ assert "/ga.js" not in r["url"]
- with open('tests/files/plos_one_article.html', 'r') as f:
+ with open("tests/files/plos_one_article.html", "r") as f:
plos_html = f.read()
resources = html_extract_resources(
@@ -202,9 +231,9 @@ def test_html_resources(adblock) -> None:
# check that custom adblock is working
for r in resources:
- assert 'crossmark-cdn.crossref.org' not in r['url']
+ assert "crossmark-cdn.crossref.org" not in r["url"]
- with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
+ with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f:
monday_html = f.read()
resources = html_extract_resources(
@@ -213,7 +242,7 @@ def test_html_resources(adblock) -> None:
adblock,
)
- with open('tests/files/elife_article.html', 'r') as f:
+ with open("tests/files/elife_article.html", "r") as f:
elife_html = f.read()
resources = html_extract_resources(
@@ -222,7 +251,7 @@ def test_html_resources(adblock) -> None:
adblock,
)
- with open('tests/files/nature_article.html', 'r') as f:
+ with open("tests/files/nature_article.html", "r") as f:
nature_html = f.read()
resources = html_extract_resources(
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 617f2b4..ad8c22e 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -11,7 +11,9 @@ from sandcrawler import *
@pytest.fixture
def ingest_worker(wayback_client, spn_client):
- grobid_client = GrobidClient(host_url="http://dummy-grobid", )
+ grobid_client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
worker = IngestFileWorker(
wayback_client=wayback_client,
spn_client=spn_client,
@@ -22,8 +24,12 @@ def ingest_worker(wayback_client, spn_client):
@pytest.fixture
def ingest_worker_pdf(wayback_client_pdf, spn_client):
- grobid_client = GrobidClient(host_url="http://dummy-grobid", )
- pgrest_client = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest", )
+ grobid_client = GrobidClient(
+ host_url="http://dummy-grobid",
+ )
+ pgrest_client = SandcrawlerPostgrestClient(
+ api_url="http://dummy-postgrest",
+ )
worker = IngestFileWorker(
wayback_client=wayback_client_pdf,
spn_client=spn_client,
@@ -36,182 +42,197 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client):
@responses.activate
def test_ingest_success(ingest_worker_pdf):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({
- "url": TARGET,
- "job_id": JOB_ID
- }))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
- TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=pdf_bytes)
- responses.add(responses.GET,
- 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format(
- "90ffd2359008d82298821d16b21778c5c39aec36"),
- status=200,
- body=json.dumps([]))
- responses.add(responses.GET,
- 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format(
- "90ffd2359008d82298821d16b21778c5c39aec36"),
- status=200,
- body=json.dumps([]))
- responses.add(responses.POST,
- 'http://dummy-grobid/api/processFulltextDocument',
- status=200,
- body=REAL_TEI_XML,
- content_type='text/xml')
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=pdf_bytes,
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/grobid?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format(
+ "90ffd2359008d82298821d16b21778c5c39aec36"
+ ),
+ status=200,
+ body=json.dumps([]),
+ )
+ responses.add(
+ responses.POST,
+ "http://dummy-grobid/api/processFulltextDocument",
+ status=200,
+ body=REAL_TEI_XML,
+ content_type="text/xml",
+ )
resp = ingest_worker_pdf.process(request)
print(resp)
- assert resp['hit'] is True
- assert resp['status'] == "success"
- assert resp['request'] == request
- assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
- assert type(resp['terminal']['terminal_dt']) == str
- assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
- assert resp['terminal']['terminal_status_code']
- assert type(resp['file_meta']['size_bytes']) == int
- assert resp['file_meta']['mimetype'] == "application/pdf"
- assert resp['cdx']['url'] == TARGET + "/redirect"
- assert 'warc_path' not in resp['cdx']
- assert 'revisit_cdx' not in resp
- assert resp['grobid']['status'] == "success"
- assert resp['grobid']['status_code'] == 200
- assert resp['grobid']['grobid_version']
- assert 'fatcat_release' in resp['grobid']
- assert 'grobid_version' not in resp['grobid']['metadata']
- assert 'fatcat_release' not in resp['grobid']['metadata']
- assert 'tei_xml' not in resp['grobid']
- assert resp['pdf_meta']['status'] == "success"
- assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
- assert resp['pdf_meta'].get('text') is None
+ assert resp["hit"] is True
+ assert resp["status"] == "success"
+ assert resp["request"] == request
+ assert resp["terminal"]["terminal_sha1hex"] == resp["file_meta"]["sha1hex"]
+ assert type(resp["terminal"]["terminal_dt"]) == str
+ assert resp["terminal"]["terminal_url"] == TARGET + "/redirect"
+ assert resp["terminal"]["terminal_status_code"]
+ assert type(resp["file_meta"]["size_bytes"]) == int
+ assert resp["file_meta"]["mimetype"] == "application/pdf"
+ assert resp["cdx"]["url"] == TARGET + "/redirect"
+ assert "warc_path" not in resp["cdx"]
+ assert "revisit_cdx" not in resp
+ assert resp["grobid"]["status"] == "success"
+ assert resp["grobid"]["status_code"] == 200
+ assert resp["grobid"]["grobid_version"]
+ assert "fatcat_release" in resp["grobid"]
+ assert "grobid_version" not in resp["grobid"]["metadata"]
+ assert "fatcat_release" not in resp["grobid"]["metadata"]
+ assert "tei_xml" not in resp["grobid"]
+ assert resp["pdf_meta"]["status"] == "success"
+ assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1
+ assert resp["pdf_meta"].get("text") is None
@responses.activate
def test_ingest_landing(ingest_worker):
request = {
- 'ingest_type': 'pdf',
- 'base_url': "http://dummy-host/",
+ "ingest_type": "pdf",
+ "base_url": "http://dummy-host/",
}
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({
- "url": TARGET,
- "job_id": JOB_ID
- }))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
- TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY,
+ )
# this is for the second time around; we don't want to fetch the same landing
# page HTML again and end up in a loop
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
- TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body="<html></html>")
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body="<html></html>",
+ )
resp = ingest_worker.process(request)
print(resp)
- assert resp['hit'] is False
- assert resp['status'] == "no-pdf-link"
- assert resp['request'] == request
- assert 'terminal' in resp
- assert 'file_meta' not in resp
- assert 'cdx' not in resp
- assert 'revisit_cdx' not in resp
- assert 'grobid' not in resp
+ assert resp["hit"] is False
+ assert resp["status"] == "no-pdf-link"
+ assert resp["request"] == request
+ assert "terminal" in resp
+ assert "file_meta" not in resp
+ assert "cdx" not in resp
+ assert "revisit_cdx" not in resp
+ assert "grobid" not in resp
@responses.activate
def test_ingest_blocklist(ingest_worker):
ingest_worker.base_url_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] is False
- assert resp['status'] == "skip-url-blocklist"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-url-blocklist"
+ assert resp["request"] == request
@responses.activate
def test_ingest_wall_blocklist(ingest_worker):
ingest_worker.wall_blocklist = [
- '://test.fatcat.wiki/',
+ "://test.fatcat.wiki/",
]
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/asdfasdf.pdf",
}
resp = ingest_worker.process(request)
- assert resp['hit'] is False
- assert resp['status'] == "skip-wall"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "skip-wall"
+ assert resp["request"] == request
@responses.activate
def test_ingest_cookie_blocklist(ingest_worker):
request = {
- 'ingest_type': 'pdf',
- 'base_url': "https://test.fatcat.wiki/cookieAbsent",
+ "ingest_type": "pdf",
+ "base_url": "https://test.fatcat.wiki/cookieAbsent",
}
resp = ingest_worker.process(request)
- assert resp['hit'] is False
- assert resp['status'] == "blocked-cookie"
- assert resp['request'] == request
+ assert resp["hit"] is False
+ assert resp["status"] == "blocked-cookie"
+ assert resp["request"] == request
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index f82eac4..9bd8b5f 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -43,7 +43,10 @@ def test_cdx_fetch(cdx_client):
assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
assert resp.warc_csize == 25338
assert resp.warc_offset == 240665973
- assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ assert (
+ resp.warc_path
+ == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+ )
# bogus datetime; shouldn't match
with pytest.raises(KeyError):
@@ -73,8 +76,9 @@ def test_cdx_lookup_best(cdx_client):
def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox(
- 25683, 2676464871,
- "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz"
+ 25683,
+ 2676464871,
+ "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz",
)
assert resp.body
@@ -112,9 +116,9 @@ def test_cdx_fetch_spn2(cdx_client):
# https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410
- #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
- #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
- #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
+ # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz
url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
datetime = "20200110222410"
@@ -145,7 +149,7 @@ def test_lookup_ftp(wayback_client):
assert resp.revisit_cdx.url != url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
# not revisit?
url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
@@ -158,7 +162,7 @@ def test_lookup_ftp(wayback_client):
assert resp.cdx.url == url
file_meta = gen_file_metadata(resp.body)
- assert file_meta['sha1hex'] == resp.cdx.sha1hex
+ assert file_meta["sha1hex"] == resp.cdx.sha1hex
@pytest.mark.skip(reason="hits prod services, requires auth")
@@ -168,10 +172,10 @@ def test_crawl_ftp(spn_client, wayback_client):
resp = spn_client.crawl_resource(url, wayback_client)
# FTP isn't supported yet!
- #assert resp.hit is True
- #assert resp.status == "success"
- #assert resp.terminal_url == url
- #assert resp.cdx.url == url
+ # assert resp.hit is True
+ # assert resp.status == "success"
+ # assert resp.terminal_url == url
+ # assert resp.cdx.url == url
assert resp.hit is False
assert resp.status == "spn2-no-ftp"
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 5830dc9..2bad851 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,77 +1,87 @@
import pytest
-from sandcrawler import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path,
- parse_cdx_line)
+from sandcrawler import (
+ b32_hex,
+ clean_url,
+ gen_file_metadata,
+ gen_file_metadata_path,
+ parse_cdx_line,
+)
def test_gen_file_metadata():
# valid (but very small) PDF file
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
file_meta = gen_file_metadata(f.read())
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
# valid HTML
fm = gen_file_metadata(
- b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
- assert fm['mimetype'] == 'text/html'
+ b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+ )
+ assert fm["mimetype"] == "text/html"
# bogus text
fm = gen_file_metadata(b"asdf1234")
- assert fm['mimetype'] == 'text/plain'
- assert fm['size_bytes'] == 8
+ assert fm["mimetype"] == "text/plain"
+ assert fm["size_bytes"] == 8
def test_gen_file_metadata_path():
# valid (but very small) PDF file
- file_meta = gen_file_metadata_path('tests/files/dummy.pdf')
+ file_meta = gen_file_metadata_path("tests/files/dummy.pdf")
assert file_meta == {
- 'mimetype': 'application/pdf',
- 'md5hex': '2942bfabb3d05332b66eb128e0842cff',
- 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
- 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
- 'size_bytes': 13264,
+ "mimetype": "application/pdf",
+ "md5hex": "2942bfabb3d05332b66eb128e0842cff",
+ "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36",
+ "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4",
+ "size_bytes": 13264,
}
def test_b32_hex():
# valid b32
- assert b32_hex(
- 'sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
- assert b32_hex(
- 'TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+ assert (
+ b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
+ assert (
+ b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC")
+ == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982"
+ )
# sha1hex pass-through
- s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+ s = "bda3c1017d52e826bbd1da51efad877272d300f9"
assert b32_hex(s) == s
# invalid
with pytest.raises(ValueError):
- assert b32_hex('blah') == 'blah'
+ assert b32_hex("blah") == "blah"
def test_parse_cdx_line():
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
correct = {
- 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
- 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6",
- 'mimetype': "application/pdf",
- 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
- 'datetime': "20170828233154",
- 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'warc_offset': 931661233,
- 'warc_csize': 210251,
- 'http_status': 200,
+ "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+ "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6",
+ "mimetype": "application/pdf",
+ "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+ "datetime": "20170828233154",
+ "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+ "warc_offset": 931661233,
+ "warc_csize": 210251,
+ "http_status": 200,
}
assert parse_cdx_line(raw) == correct
@@ -92,5 +102,9 @@ def test_invalid_cdx():
def test_clean_url():
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
- assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
- "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ assert (
+ clean_url(
+ "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
+ == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+ )
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 086243a..9d75655 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -15,30 +15,31 @@ def test_process_fake_pdf():
print(resp)
assert resp.status == "not-pdf"
- with open('tests/files/dummy_zip.zip', 'rb') as f:
+ with open("tests/files/dummy_zip.zip", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'not-pdf'
+ assert resp.status == "not-pdf"
-@pytest.mark.skipif(poppler.version_string() == '0.71.0',
- reason="unsupported version of poppler")
+@pytest.mark.skipif(
+ poppler.version_string() == "0.71.0", reason="unsupported version of poppler"
+)
def test_process_dummy_pdf():
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
resp = process_pdf(pdf_bytes)
- assert resp.status == 'success'
+ assert resp.status == "success"
assert resp.page0_thumbnail is not None
assert len(resp.text) > 10
assert resp.meta_xml is None
- assert resp.file_meta['mimetype'] == 'application/pdf'
+ assert resp.file_meta["mimetype"] == "application/pdf"
print(resp.pdf_info)
print(resp.pdf_extra)
- assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis"
+ assert resp.pdf_info["Author"] == "Evangelos Vlachogiannis"
# 595 x 842
- assert resp.pdf_extra['page0_height'] == 842
- assert resp.pdf_extra['page0_width'] == 595
- assert resp.pdf_extra['page_count'] == 1
+ assert resp.pdf_extra["page0_height"] == 842
+ assert resp.pdf_extra["page0_width"] == 595
+ assert resp.pdf_extra["page_count"] == 1
def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
@@ -46,17 +47,17 @@ def test_pdfextract_worker_cdx(wayback_client): # noqa: F811
sink = BlackholeSink()
worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(
worker,
cdx_file,
filter_http_statuses=[200, 226],
- filter_mimetypes=['application/pdf'],
+ filter_mimetypes=["application/pdf"],
)
pusher_counts = pusher.run()
- assert pusher_counts['total']
- assert pusher_counts['pushed'] == 7
- assert pusher_counts['pushed'] == worker.counts['total']
+ assert pusher_counts["total"]
+ assert pusher_counts["pushed"] == 7
+ assert pusher_counts["pushed"] == worker.counts["total"]
def test_pdfextract_blob_worker():
@@ -64,7 +65,7 @@ def test_pdfextract_blob_worker():
sink = BlackholeSink()
worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink)
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
worker.process(pdf_bytes)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 353a560..ed17d24 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -6,22 +6,24 @@ def test_cdx_line_pusher():
sink = BlackholeSink()
# vanilla (only default filters)
- with open('tests/files/example.cdx', 'r') as cdx_file:
+ with open("tests/files/example.cdx", "r") as cdx_file:
pusher = CdxLinePusher(sink, cdx_file)
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['pushed'] == 19
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["pushed"] == 19
# HTTP 200 and application/pdf
- with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(sink,
- cdx_file,
- filter_mimetypes=['application/pdf'],
- filter_http_statuses=[200, 226])
+ with open("tests/files/example.cdx", "r") as cdx_file:
+ pusher = CdxLinePusher(
+ sink,
+ cdx_file,
+ filter_mimetypes=["application/pdf"],
+ filter_http_statuses=[200, 226],
+ )
counts = pusher.run()
- assert counts['total'] == 20
- assert counts['skip-parse'] == 1
- assert counts['skip-http_status'] == 10
- assert counts['skip-mimetype'] == 2
- assert counts['pushed'] == 7
+ assert counts["total"] == 20
+ assert counts["skip-parse"] == 1
+ assert counts["skip-http_status"] == 10
+ assert counts["skip-mimetype"] == 2
+ assert counts["pushed"] == 7
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 50cabb4..f3a5e46 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -15,7 +15,7 @@ PENDING_BODY = {
"https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js",
"https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js",
"https://cdn.onesignal.com/sdks/OneSignalSDK.js",
- ]
+ ],
}
SUCCESS_BODY = {
"status": "success",
@@ -25,7 +25,9 @@ SUCCESS_BODY = {
"timestamp": "20180326070330",
"duration_sec": 6.203,
"resources": [
- TARGET, TARGET + "/redirect", "http://brewster.kahle.org/",
+ TARGET,
+ TARGET + "/redirect",
+ "http://brewster.kahle.org/",
"http://brewster.kahle.org/favicon.ico",
"http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg",
"http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg",
@@ -40,7 +42,8 @@ SUCCESS_BODY = {
"http://brewster.kahle.org/wp-content/themes/twentyten/style.css",
"http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4",
"http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
- "http://platform.twitter.com/widgets.js", "https://archive-it.org/piwik.js",
+ "http://platform.twitter.com/widgets.js",
+ "https://archive-it.org/piwik.js",
"https://platform.twitter.com/jot.html",
"https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js",
"https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html",
@@ -54,12 +57,12 @@ SUCCESS_BODY = {
"https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png",
"https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1",
"https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4",
- "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4"
+ "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4",
],
"outlinks": {
"https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695",
- "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695"
- }
+ "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695",
+ },
}
ERROR_BODY = {
"status": "error",
@@ -67,17 +70,34 @@ ERROR_BODY = {
"status_ext": "error:invalid-host-resolution",
"job_id": JOB_ID,
"message": "Couldn't resolve host for http://example5123.com.",
- "resources": []
+ "resources": [],
}
CDX_SPN_HIT = [
[
- "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
- "robotflags", "length", "offset", "filename"
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
],
[
- "wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200",
- CDX_BEST_SHA1B32, "-", "-", "8445", "108062304",
- "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz"
+ "wiki,fatcat)/",
+ "20180326070330",
+ TARGET + "/redirect",
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz",
],
]
@@ -96,25 +116,30 @@ def spn_client():
@responses.activate
def test_savepagenow_success(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({
- "url": TARGET,
- "job_id": JOB_ID
- }))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
@@ -124,35 +149,38 @@ def test_savepagenow_success(spn_client):
assert resp.status == "success"
assert resp.request_url == TARGET
assert resp.terminal_url == TARGET + "/redirect"
- assert resp.terminal_dt == SUCCESS_BODY['timestamp']
- assert resp.resources == SUCCESS_BODY['resources']
+ assert resp.terminal_dt == SUCCESS_BODY["timestamp"]
+ assert resp.resources == SUCCESS_BODY["resources"]
@responses.activate
def test_savepagenow_remote_error(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({
- "url": TARGET,
- "job_id": JOB_ID
- }))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(ERROR_BODY))
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(ERROR_BODY),
+ )
resp = spn_client.save_url_now_v2(TARGET)
assert len(responses.calls) == 3
assert resp.success is False
- assert resp.status == ERROR_BODY['status_ext']
+ assert resp.status == ERROR_BODY["status_ext"]
assert resp.request_url == TARGET
assert resp.terminal_url is None
assert resp.terminal_dt is None
@@ -162,17 +190,18 @@ def test_savepagenow_remote_error(spn_client):
@responses.activate
def test_savepagenow_500(spn_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({
- "url": TARGET,
- "job_id": JOB_ID
- }))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=500,
- body=json.dumps(ERROR_BODY))
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=500,
+ body=json.dumps(ERROR_BODY),
+ )
with pytest.raises(SavePageNowError):
spn_client.save_url_now_v2(TARGET)
@@ -183,33 +212,36 @@ def test_savepagenow_500(spn_client):
@responses.activate
def test_crawl_resource(spn_client, wayback_client):
- responses.add(responses.POST,
- 'http://dummy-spnv2/save',
- status=200,
- body=json.dumps({
- "url": TARGET,
- "job_id": JOB_ID
- }))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(PENDING_BODY))
- responses.add(responses.GET,
- 'http://dummy-spnv2/save/status/' + JOB_ID,
- status=200,
- body=json.dumps(SUCCESS_BODY))
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SPN_HIT))
- responses.add(responses.GET,
- 'https://web.archive.org/web/{}id_/{}'.format("20180326070330",
- TARGET + "/redirect"),
- status=200,
- headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
- body=WARC_BODY)
-
- print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"))
+ responses.add(
+ responses.POST,
+ "http://dummy-spnv2/save",
+ status=200,
+ body=json.dumps({"url": TARGET, "job_id": JOB_ID}),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(PENDING_BODY),
+ )
+ responses.add(
+ responses.GET,
+ "http://dummy-spnv2/save/status/" + JOB_ID,
+ status=200,
+ body=json.dumps(SUCCESS_BODY),
+ )
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT)
+ )
+ responses.add(
+ responses.GET,
+ "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body=WARC_BODY,
+ )
+
+ print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"))
resp = spn_client.crawl_resource(TARGET, wayback_client)
assert len(responses.calls) == 5
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 0cb59fa..da4dfd8 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -10,13 +10,30 @@ CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
[
- "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
- "robotflags", "length", "offset", "filename"
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
],
[
- "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
]
@@ -24,47 +41,112 @@ CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
[
- "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
- "robotflags", "length", "offset", "filename"
+ "urlkey",
+ "timestamp",
+ "original",
+ "mimetype",
+ "statuscode",
+ "digest",
+ "redirect",
+ "robotflags",
+ "length",
+ "offset",
+ "filename",
],
[
- "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
# sooner, but not right mimetype
[
- "wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "text/html",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
# sooner and mimetype, but wrong status code
[
- "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
[
- "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
[
- "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ "20180912220054",
+ CDX_TARGET,
+ "application/pdf",
+ "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
# "best"
[
- "wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-",
- "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ CDX_DT,
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ CDX_BEST_SHA1B32,
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
# older
[
- "wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200",
- "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
- "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ "wiki,fatcat)/",
+ "20180712220054",
+ CDX_TARGET,
+ "application/pdf",
+ "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+ "-",
+ "-",
+ "8445",
+ "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
],
]
@@ -81,10 +163,9 @@ def cdx_client():
@responses.activate
def test_cdx_fetch(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
@@ -104,10 +185,9 @@ def test_cdx_fetch_errors(cdx_client):
with pytest.raises(ValueError):
resp = cdx_client.fetch(CDX_TARGET, "2019")
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+ )
with pytest.raises(KeyError):
resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -123,10 +203,9 @@ def test_cdx_fetch_errors(cdx_client):
@responses.activate
def test_cdx_lookup_best(cdx_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
@@ -175,7 +254,7 @@ def wayback_client(cdx_client, mocker):
@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
- with open('tests/files/dummy.pdf', 'rb') as f:
+ with open("tests/files/dummy.pdf", "rb") as f:
pdf_bytes = f.read()
client = WaybackClient(
@@ -209,10 +288,9 @@ def test_wayback_fetch(wayback_client):
@responses.activate
def test_lookup_resource_success(wayback_client):
- responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ responses.add(
+ responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+ )
resp = wayback_client.lookup_resource(CDX_TARGET)
diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
index 1742f3a..786f863 100644
--- a/python/tests/test_xml.py
+++ b/python/tests/test_xml.py
@@ -5,7 +5,7 @@ from sandcrawler.xml import xml_reserialize
def test_xml_reserialize() -> None:
- with open('tests/files/scielo_article.jats.xml', 'rb') as f:
+ with open("tests/files/scielo_article.jats.xml", "rb") as f:
raw_xml = f.read()
assert b'encoding="ISO-8859-1"' in raw_xml