diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-17 12:12:05 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-17 12:22:34 -0800 |
commit | 58f744e97c8f3f1a3472aa821f4518d7d139e850 (patch) | |
tree | 704bee1fa9ae33e942a6fcb084d260df6187ed35 /python/tests/test_ingest.py | |
parent | 246f8033ba189bdf3ddbf64b2cb851d86ec43b75 (diff) | |
download | sandcrawler-58f744e97c8f3f1a3472aa821f4518d7d139e850.tar.gz sandcrawler-58f744e97c8f3f1a3472aa821f4518d7d139e850.zip |
ingest: add URL blocklist feature
And, temporarily, block zenodo and figshare.
Diffstat (limited to 'python/tests/test_ingest.py')
-rw-r--r-- | python/tests/test_ingest.py | 17 |
1 file changed, 17 insertions, 0 deletions
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 050e2ea..33de35d 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -149,3 +149,20 @@ def test_ingest_landing(ingest_worker):
     assert 'revisit_cdx' not in resp
     assert 'grobid' not in resp
 
+@responses.activate
+def test_ingest_blocklist(ingest_worker):
+
+    ingest_worker.base_url_blocklist = [
+        '://test.fatcat.wiki/',
+    ]
+    request = {
+        'ingest_type': 'pdf',
+        'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+    }
+
+    resp = ingest_worker.process(request)
+
+    assert resp['hit'] == False
+    assert resp['status'] == "skip-url-blocklist"
+    assert resp['request'] == request
+