aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-09-08 14:10:55 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-09-08 14:10:55 -0700
commit: 047cfcebd8ff792d6da9c10b0bee27ec689a4e9f (patch)
tree: 8292e5ef65d0f9bc392eff8ffdef2d755ace7ded
parent: ce25d59845083ca0beab98144b0c43bfc4254d6d (diff)
download: sandcrawler-047cfcebd8ff792d6da9c10b0bee27ec689a4e9f.tar.gz
sandcrawler-047cfcebd8ff792d6da9c10b0bee27ec689a4e9f.zip
ingest: more block patterns, for huge databases
-rw-r--r-- python/sandcrawler/ingest.py | 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 630c477..1d33b94 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -120,13 +120,16 @@ class IngestFileWorker(SandcrawlerWorker):
"www.plate-archive.org/",
"://doi.org/10.25642/ipk/gbis/",
"://apex.ipk-gatersleben.de/",
+ "fao.org/glis/",
# Historical non-paper content:
"dhz.uni-passau.de/", # newspapers
"digital.ucd.ie/", # ireland national historical
# DOI prefixes
- "://doi.org/10.2307/", # JSTOR; slow and many redirects
+ "doi.org/10.2307/", # JSTOR; slow and many redirects
+ "doi.org/10.18730/", # fao.org: database entry
+ "doi.org/10.15468/", # gbif.org: database entry
# deprecated domain (doesn't redirect correctly)
"://edoc.mpg.de/",