From 58f744e97c8f3f1a3472aa821f4518d7d139e850 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 17 Jan 2020 12:12:05 -0800 Subject: ingest: add URL blocklist feature And, temporarily, block zenodo and figshare. --- python/sandcrawler/ingest.py | 36 ++++++++++++++++++++++++++++++++---- python/tests/test_ingest.py | 17 +++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 53c4ccf..4732509 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -59,10 +59,34 @@ class IngestFileWorker(SandcrawlerWorker): self.pgrest_client = SandcrawlerPostgrestClient() self.grobid_sink = kwargs.get('grobid_sink') - self.try_existing_ingest = False - self.try_existing_grobid = True - self.try_wayback = True - self.try_spn2 = True + self.try_existing_ingest = kwargs.get('try_existing_ingest', False) + self.try_existing_grobid = kwargs.get('try_existing_grobid', True) + self.try_wayback = kwargs.get('try_wayback', True) + self.try_spn2 = kwargs.get('try_spn2', True) + + self.base_url_blocklist = [ + # temporary, until we do specific crawls + "://doi.org/10.5281/zenodo", + "://doi.org/10.6084/", + "://doi.org/10.11583/", + "://doi.org/10.1184/", + "://zenodo.org/", + "://figshare.com/", + + # temporary, until we implement specific fetch and 'petabox' output + "://archive.org/", + "://web.archive.org/web/", + "://openlibrary.org/", + "://fatcat.wiki/", + + # Domain squats + "://bartandjones.com", + "://ijretm.com", + "://ijrcemas.com", + "://jist.net.in", + "://croisements-revue.org", + + ] def check_existing_ingest(self, base_url): """ @@ -197,6 +221,10 @@ class IngestFileWorker(SandcrawlerWorker): ingest_type = request.get('ingest_type') base_url = request['base_url'] + for block in self.base_url_blocklist: + if block in base_url: + return dict(request=request, hit=False, status="skip-url-blocklist") + print("[INGEST {}\t] {}".format(ingest_type, base_url), file=sys.stderr) best_mimetype = None diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 050e2ea..33de35d 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -149,3 +149,20 @@ def test_ingest_landing(ingest_worker): assert 'revisit_cdx' not in resp assert 'grobid' not in resp +@responses.activate +def test_ingest_blocklist(ingest_worker): + + ingest_worker.base_url_blocklist = [ + '://test.fatcat.wiki/', + ] + request = { + 'ingest_type': 'pdf', + 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + } + + resp = ingest_worker.process(request) + + assert resp['hit'] == False + assert resp['status'] == "skip-url-blocklist" + assert resp['request'] == request + -- cgit v1.2.3