diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 17:31:08 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 17:31:08 -0800 |
commit | 24185837a47f305757a5c783b95ca25b709f66e3 (patch) | |
tree | e71e179e93932ad04ba14dbc6308d3e4deb3eeb7 /python/tests | |
parent | 00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (diff) | |
download | sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.tar.gz sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.zip |
refactor ingest to a loop, allowing multiple hops
Diffstat (limited to 'python/tests')
-rw-r--r-- | python/tests/test_ingest.py | 11 |
1 file changed, 9 insertions, 2 deletions
```diff
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8692b21..f5599e9 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -109,12 +109,19 @@ def test_ingest_landing(ingest_worker):
         headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
         body=WARC_BODY)
 
+    # this is for second time around; don't want to fetch same landing page
+    # HTML again and result in a loop
+    responses.add(responses.GET,
+        'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+        status=200,
+        headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+        body="<html></html>")
+
     resp = ingest_worker.process(request)
     print(resp)
     assert resp['hit'] == False
-    assert resp['status'] == "wrong-mimetype"
+    assert resp['status'] == "no-pdf-link"
     assert resp['request'] == request
     assert 'grobid' not in resp
-    assert resp['terminal']
```