author     Bryan Newbold <bnewbold@archive.org>  2022-10-31 10:51:44 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2022-10-31 10:51:44 -0700
commit     458b8010dc76eb25e24b1ed6626bfb34bc2d0668 (patch)
tree       ad3b30d610d29dacedeb6378e1ea37d94649c435 /fatcat_scholar
parent     9da8c9bdb9149f3701f8774c1a220de889e7cc59 (diff)
robots: more blocks to reduce load on search cluster
Diffstat (limited to 'fatcat_scholar')
-rw-r--r--  fatcat_scholar/static/robots.allow.txt | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 6076e75..9f98785 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -9,13 +9,19 @@ User-agent: bingbot
User-agent: Googlebot
User-agent: SemanticScholarBot
User-agent: yacybot
+User-agent: PetalBot
+User-agent: Yeti
+User-agent: Riddler
Disallow: /search
# crawling search result pages is expensive, so we do specify a long crawl
# delay for those (for bots other than the above broad search bots)
+# UPDATE: actually, just block all robots from search page, we are overwhelmed
+# as of 2022-10-31
User-agent: *
-Allow: /search
-Crawl-delay: 5
+Disallow: /search
+#Allow: /search
+#Crawl-delay: 5
# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
# codes are used for rate-limiting instead of any crawl delay specified here.
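
For reference, here is a sketch of how the affected section of robots.allow.txt reads after this patch, reconstructed from the hunk above. The blank lines separating stanzas are an assumption (the hunk's line counts imply two context lines not visible above), and earlier lines of the same stanza (e.g. the User-agent: bingbot line referenced in the hunk header) are omitted:

User-agent: Googlebot
User-agent: SemanticScholarBot
User-agent: yacybot
User-agent: PetalBot
User-agent: Yeti
User-agent: Riddler
Disallow: /search

# crawling search result pages is expensive, so we do specify a long crawl
# delay for those (for bots other than the above broad search bots)
# UPDATE: actually, just block all robots from search page, we are overwhelmed
# as of 2022-10-31
User-agent: *
Disallow: /search
#Allow: /search
#Crawl-delay: 5

# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
# codes are used for rate-limiting instead of any crawl delay specified here.

The net effect is that /search is disallowed for every crawler, not just the named search bots; the previous Allow and Crawl-delay rules are kept as comments rather than deleted, and rate-limiting for the rest of the site continues to rely on HTTP 429 responses rather than a robots.txt crawl delay.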