about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-05-11 12:19:48 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-05-11 12:19:48 -0700
commit 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 (patch)
tree 1e2fa8a3a95be62b63db26d4c1584014d4be8c8f
parent a0214959c10a5ecb794d78b189a767ac01c0af48 (diff)
download sandcrawler-1534ff4d05c6fca460e82b5707fe3fbdc3504e50.tar.gz
sandcrawler-1534ff4d05c6fca460e82b5707fe3fbdc3504e50.zip
ingest: skip arxiv.org DOIs, we already direct-ingest
-rw-r--r-- python/sandcrawler/ingest_file.py 1
1 files changed, 1 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index f7c7d78..eca8bf8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -151,6 +151,7 @@ class IngestFileWorker(SandcrawlerWorker):
"doi.org/10.2307/", # JSTOR; slow and many redirects
"doi.org/10.18730/", # fao.org: database entry
"doi.org/10.15468/", # gbif.org: database entry
+ "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
# deprecated domain (doesn't redirect correctly)
"://edoc.mpg.de/",
# bogus/spam PDFs