about | summary | refs | log | tree | commit | diff | stats
path: root/python/tests
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-17 12:12:05 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-17 12:22:34 -0800
commit 58f744e97c8f3f1a3472aa821f4518d7d139e850 (patch)
tree 704bee1fa9ae33e942a6fcb084d260df6187ed35 /python/tests
parent 246f8033ba189bdf3ddbf64b2cb851d86ec43b75 (diff)
download sandcrawler-58f744e97c8f3f1a3472aa821f4518d7d139e850.tar.gz
sandcrawler-58f744e97c8f3f1a3472aa821f4518d7d139e850.zip
ingest: add URL blocklist feature
And, temporarily, block zenodo and figshare.
Diffstat (limited to 'python/tests')
-rw-r--r-- python/tests/test_ingest.py 17
1 files changed, 17 insertions, 0 deletions
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 050e2ea..33de35d 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -149,3 +149,20 @@ def test_ingest_landing(ingest_worker):
assert 'revisit_cdx' not in resp
assert 'grobid' not in resp
+@responses.activate
+def test_ingest_blocklist(ingest_worker):
+
+ ingest_worker.base_url_blocklist = [
+ '://test.fatcat.wiki/',
+ ]
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "skip-url-blocklist"
+ assert resp['request'] == request
+