diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-05-11 12:19:48 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-05-11 12:19:48 -0700 |
commit | 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 (patch) | |
tree | 1e2fa8a3a95be62b63db26d4c1584014d4be8c8f | |
parent | a0214959c10a5ecb794d78b189a767ac01c0af48 (diff) | |
download | sandcrawler-1534ff4d05c6fca460e82b5707fe3fbdc3504e50.tar.gz sandcrawler-1534ff4d05c6fca460e82b5707fe3fbdc3504e50.zip |
ingest: skip arxiv.org DOIs, we already direct-ingest
-rw-r--r-- | python/sandcrawler/ingest_file.py | 1 |
1 file changed, 1 insertion, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index f7c7d78..eca8bf8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -151,6 +151,7 @@ class IngestFileWorker(SandcrawlerWorker):
 "doi.org/10.2307/", # JSTOR; slow and many redirects
 "doi.org/10.18730/", # fao.org: database entry
 "doi.org/10.15468/", # gbif.org: database entry
+"doi.org/10.48550/", # arxiv.org: redundant with direct ingest
 # deprecated domain (doesn't redirect correctly)
 "://edoc.mpg.de/",
 # bogus/spam PDFs