From 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 11 May 2022 12:19:48 -0700
Subject: ingest: skip arxiv.org DOIs, we already direct-ingest

---
 python/sandcrawler/ingest_file.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index f7c7d78..eca8bf8 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -151,6 +151,7 @@ class IngestFileWorker(SandcrawlerWorker):
             "doi.org/10.2307/", # JSTOR; slow and many redirects
             "doi.org/10.18730/", # fao.org: database entry
             "doi.org/10.15468/", # gbif.org: database entry
+            "doi.org/10.48550/", # arxiv.org: redundant with direct ingest
             # deprecated domain (doesn't redirect correctly)
             "://edoc.mpg.de/",
             # bogus/spam PDFs
-- 
cgit v1.2.3