aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/sandcrawler/ingest.py9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 9f07fc3..6f9ea45 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -81,6 +81,15 @@ class IngestFileWorker(SandcrawlerWorker):
# all stubs/previews, not full papers
"://page-one.live.cf.public.springer.com",
+ # large datasets-only (no PDF expected)
+ "plutof.ut.ee/",
+ "www.gbif.org/",
+ "doi.pangaea.de/",
+ "www.plate-archive.org/",
+
+ # Historical non-paper content:
+ "dhz.uni-passau.de/", # newspapers
+ "digital.ucd.ie/", # ireland national historical
]
def check_existing_ingest(self, base_url):