aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
commit 24185837a47f305757a5c783b95ca25b709f66e3 (patch)
tree e71e179e93932ad04ba14dbc6308d3e4deb3eeb7 /python/tests
parent 00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (diff)
download sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.tar.gz
sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.zip
refactor ingest to a loop, allowing multiple hops
Diffstat (limited to 'python/tests')
-rw-r--r-- python/tests/test_ingest.py 11
1 files changed, 9 insertions, 2 deletions
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8692b21..f5599e9 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -109,12 +109,19 @@ def test_ingest_landing(ingest_worker):
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
body=WARC_BODY)
+ # this is for second time around; don't want to fetch same landing page
+ # HTML again and result in a loop
+ responses.add(responses.GET,
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body="<html></html>")
+
resp = ingest_worker.process(request)
print(resp)
assert resp['hit'] == False
- assert resp['status'] == "wrong-mimetype"
+ assert resp['status'] == "no-pdf-link"
assert resp['request'] == request
assert 'grobid' not in resp
- assert resp['terminal']