From 52576d07393b3511474ef077171c012244b3ccd9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 11 Jan 2022 12:54:59 -0800
Subject: robots.txt: block /search for large crawlers

---
 fatcat_scholar/static/robots.allow.txt | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'fatcat_scholar')

diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 21a343e..ccdfda1 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -2,17 +2,25 @@
 # If you are considering large or automated crawling, you may want to look at
 # our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
 
+# large-scale search engine bots should not crawl search result pages
+User-agent: SemrushBot
+User-agent: YandexBot
+User-agent: bingbot
+User-agent: Googlebot
+Disallow: /search
+
+# crawling search result pages is expensive, so we do specify a long crawl
+# delay for them (applies to bots other than the broad search bots above)
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
 # by default, can crawl anything on this domain. HTTP 429 ("backoff") status
 # codes are used for rate-limiting instead of any crawl delay specified here.
 # Up to a handful concurrent requests should be fine.
 User-Agent: *
 Allow: /
 
-# crawling search result pages is expensive, so we do specify a long crawl delay for those
-User-agent: *
-Allow: /search
-Crawl-delay: 5
-
 Sitemap: https://scholar.archive.org/sitemap.xml
 Sitemap: https://scholar.archive.org/sitemap-index-works.xml
 
-- 
cgit v1.2.3