From e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 10 Aug 2020 15:07:19 -0700
Subject: update crawl blocklist for SPNv2 requests which mostly fail

---
 python/fatcat_tools/workers/changelog.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d5891ad1..1ac7a865 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -101,12 +101,20 @@ class EntityUpdatesWorker(FatcatWorker):
             "10.3932/",
             # ccdc.cam.ac.uk: crystal structures
             "10.5517/",
+            # researchgate: mostly blocks our crawler
+            "10.13140/",
+            # springerlink: mostly blocks crawler
+            "10.1007/",
+            # nature group: mostly blocks crawler
+            "10.1038/",
+            # SAGE: mostly blocks crawler
+            "10.1177/",
+            # IOP: mostly blocks crawler
+            "10.1088/",
         ]
         self.live_pdf_ingest_doi_prefix_acceptlist = [
             # biorxiv and medrxiv
             "10.1101/",
-            # researchgate
-            "10.13140/",
             # the lancet (often hybrid OA)
             "10.1016/s0140-6736",
             "10.1016/s2213-2600",
--
cgit v1.2.3