aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-22 14:05:03 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-22 14:05:03 -0800
commit 04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44 (patch)
tree 1a7b906c6d88355bc20759196dea6ca915da598a
parent d08aac7381a392cecfe8931821df5e149b58f32a (diff)
download sandcrawler-04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44.tar.gz
sandcrawler-04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44.zip
ingest: skip more non-pdf, non-paper domains
-rw-r--r-- python/sandcrawler/ingest.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 9f07fc3..6f9ea45 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -81,6 +81,15 @@ class IngestFileWorker(SandcrawlerWorker):
# all stubs/previews, not full papers
"://page-one.live.cf.public.springer.com",
+ # large datasets-only (no PDF expected)
+ "plutof.ut.ee/",
+ "www.gbif.org/",
+ "doi.pangaea.de/",
+ "www.plate-archive.org/",
+
+ # Historical non-paper content:
+ "dhz.uni-passau.de/", # newspapers
+ "digital.ucd.ie/", # ireland national historical
]
def check_existing_ingest(self, base_url):