diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-10 15:07:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-10 15:07:19 -0700 |
commit | e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1 (patch) | |
tree | f3ac6aab6c5ca7aafb957063a2b5c1482cded5ea /python/fatcat_tools/workers/changelog.py | |
parent | de0fb59f0e36d8079649feefb7592189d8f7c6ed (diff) | |
download | fatcat-e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1.tar.gz fatcat-e9dd3c73f036d3fba2680eeaff8e62ecf2dbf9a1.zip |
update crawl blocklist: add DOI prefixes for which SPNv2 requests mostly fail
Diffstat (limited to 'python/fatcat_tools/workers/changelog.py')
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 12 |
1 file changed, 10 insertions, 2 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d5891ad1..1ac7a865 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -101,12 +101,20 @@ class EntityUpdatesWorker(FatcatWorker): "10.3932/", # ccdc.cam.ac.uk: crystal structures "10.5517/", + # researchgate: mostly blocks our crawler + "10.13140/", + # springerlink: mostly blocks crawler + "10.1007/", + # nature group: mostly blocks crawler + "10.1038/", + # SAGE: mostly blocks crawler + "10.1177/", + # IOP: mostly blocks crawler + "10.1088/", ] self.live_pdf_ingest_doi_prefix_acceptlist = [ # biorxiv and medrxiv "10.1101/", - # researchgate - "10.13140/", # the lancet (often hybrid OA) "10.1016/s0140-6736", "10.1016/s2213-2600", |