From 04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 22 Feb 2020 14:05:03 -0800 Subject: ingest: skip more non-pdf, non-paper domains --- python/sandcrawler/ingest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 9f07fc3..6f9ea45 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -81,6 +81,15 @@ class IngestFileWorker(SandcrawlerWorker): # all stubs/previews, not full papers "://page-one.live.cf.public.springer.com", + # large datasets-only (no PDF expected) + "plutof.ut.ee/", + "www.gbif.org/", + "doi.pangaea.de/", + "www.plate-archive.org/", + + # Historical non-paper content: + "dhz.uni-passau.de/", # newspapers + "digital.ucd.ie/", # ireland national historical ] def check_existing_ingest(self, base_url): -- cgit v1.2.3