diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-09-08 14:10:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-09-08 14:10:55 -0700 |
commit | 047cfcebd8ff792d6da9c10b0bee27ec689a4e9f (patch) | |
tree | 8292e5ef65d0f9bc392eff8ffdef2d755ace7ded | |
parent | ce25d59845083ca0beab98144b0c43bfc4254d6d (diff) | |
download | sandcrawler-047cfcebd8ff792d6da9c10b0bee27ec689a4e9f.tar.gz sandcrawler-047cfcebd8ff792d6da9c10b0bee27ec689a4e9f.zip |
ingest: more block patterns, for huge databases
-rw-r--r-- | python/sandcrawler/ingest.py | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 630c477..1d33b94 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -120,13 +120,16 @@ class IngestFileWorker(SandcrawlerWorker):
             "www.plate-archive.org/",
             "://doi.org/10.25642/ipk/gbis/",
             "://apex.ipk-gatersleben.de/",
+            "fao.org/glis/",
 
             # Historical non-paper content:
             "dhz.uni-passau.de/",  # newspapers
             "digital.ucd.ie/",  # ireland national historical
 
             # DOI prefixes
-            "://doi.org/10.2307/",  # JSTOR; slow and many redirects
+            "doi.org/10.2307/",  # JSTOR; slow and many redirects
+            "doi.org/10.18730/",  # fao.org: database entry
+            "doi.org/10.15468/",  # gbif.org: database entry
 
             # deprecated domain (doesn't redirect correctly)
             "://edoc.mpg.de/",
```