about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-17 12:12:05 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-17 12:22:34 -0800
commit 58f744e97c8f3f1a3472aa821f4518d7d139e850 (patch)
tree 704bee1fa9ae33e942a6fcb084d260df6187ed35 /python
parent 246f8033ba189bdf3ddbf64b2cb851d86ec43b75 (diff)
download sandcrawler-58f744e97c8f3f1a3472aa821f4518d7d139e850.tar.gz
sandcrawler-58f744e97c8f3f1a3472aa821f4518d7d139e850.zip
ingest: add URL blocklist feature
And, temporarily, block zenodo and figshare.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 36
-rw-r--r-- python/tests/test_ingest.py 17
2 files changed, 49 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 53c4ccf..4732509 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -59,10 +59,34 @@ class IngestFileWorker(SandcrawlerWorker):
self.pgrest_client = SandcrawlerPostgrestClient()
self.grobid_sink = kwargs.get('grobid_sink')
- self.try_existing_ingest = False
- self.try_existing_grobid = True
- self.try_wayback = True
- self.try_spn2 = True
+ self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
+ self.try_existing_grobid = kwargs.get('try_existing_grobid', True)
+ self.try_wayback = kwargs.get('try_wayback', True)
+ self.try_spn2 = kwargs.get('try_spn2', True)
+
+ self.base_url_blocklist = [
+ # temporary, until we do specific crawls
+ "://doi.org/10.5281/zenodo",
+ "://doi.org/10.6084/",
+ "://doi.org/10.11583/",
+ "://doi.org/10.1184/",
+ "://zenodo.org/",
+ "://figshare.com/",
+
+ # temporary, until we implement specific fetch and 'petabox' output
+ "://archive.org/",
+ "://web.archive.org/web/",
+ "://openlibrary.org/",
+ "://fatcat.wiki/",
+
+ # Domain squats
+ "://bartandjones.com",
+ "://ijretm.com",
+ "://ijrcemas.com",
+ "://jist.net.in",
+ "://croisements-revue.org",
+
+ ]
def check_existing_ingest(self, base_url):
"""
@@ -197,6 +221,10 @@ class IngestFileWorker(SandcrawlerWorker):
ingest_type = request.get('ingest_type')
base_url = request['base_url']
+ for block in self.base_url_blocklist:
+ if block in base_url:
+ return dict(request=request, hit=False, status="skip-url-blocklist")
+
print("[INGEST {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
best_mimetype = None
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 050e2ea..33de35d 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -149,3 +149,20 @@ def test_ingest_landing(ingest_worker):
assert 'revisit_cdx' not in resp
assert 'grobid' not in resp
+@responses.activate
+def test_ingest_blocklist(ingest_worker):
+
+ ingest_worker.base_url_blocklist = [
+ '://test.fatcat.wiki/',
+ ]
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "skip-url-blocklist"
+ assert resp['request'] == request
+