author    | Bryan Newbold <bnewbold@archive.org> | 2022-01-11 12:54:59 -0800
committer | Bryan Newbold <bnewbold@archive.org> | 2022-01-11 12:54:59 -0800
commit    | 52576d07393b3511474ef077171c012244b3ccd9 (patch)
tree      | 481102a25974118b43f5693da975da1cff8ef614 /fatcat_scholar/static/robots.allow.txt
parent    | 3d830aa01867e1324d1c85ba681cf799d98605bc (diff)
download  | fatcat-scholar-52576d07393b3511474ef077171c012244b3ccd9.tar.gz, fatcat-scholar-52576d07393b3511474ef077171c012244b3ccd9.zip
robots.txt: block /search for large crawlers
Diffstat (limited to 'fatcat_scholar/static/robots.allow.txt')
-rw-r--r-- | fatcat_scholar/static/robots.allow.txt | 18
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 21a343e..ccdfda1 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -2,17 +2,25 @@
 # If you are considering large or automated crawling, you may want to look at
 # our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
 
+# large-scale bots should not index search pages
+User-agent: SemrushBot
+User-agent: YandexBot
+User-agent: bingbot
+User-agent: Googlebot
+Disallow: /search
+
+# crawling search result pages is expensive, so we do specify a long crawl
+# delay for those (for bots other than the above broad search bots)
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
 # by default, can crawl anything on this domain. HTTP 429 ("backoff") status
 # codes are used for rate-limiting instead of any crawl delay specified here.
 # Up to a handful concurrent requests should be fine.
 User-Agent: *
 Allow: /
 
-# crawling search result pages is expensive, so we do specify a long crawl delay for those
-User-agent: *
-Allow: /search
-Crawl-delay: 5
-
 Sitemap: https://scholar.archive.org/sitemap.xml
 Sitemap: https://scholar.archive.org/sitemap-index-works.xml
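For reference, here is a minimal sketch (not part of the commit) of how the new rules evaluate, using only Python's standard urllib.robotparser. The ROBOTS_TXT string is an abridged copy of the post-commit file from the diff above, and "ExampleBot" is a hypothetical crawler name used for illustration.

```python
# Sketch: evaluate the post-commit robots.allow.txt rules with the standard library.
import urllib.robotparser

# Abridged copy of the new robots.allow.txt from the diff above
# (some comments and the Sitemap lines trimmed).
ROBOTS_TXT = """\
# large-scale bots should not index search pages
User-agent: SemrushBot
User-agent: YandexBot
User-agent: bingbot
User-agent: Googlebot
Disallow: /search

# other bots may crawl /search, but with a crawl delay
User-agent: *
Allow: /search
Crawl-delay: 5

# by default, can crawl anything on this domain
User-Agent: *
Allow: /
"""

rp = urllib.robotparser.RobotFileParser()
rp.parse(ROBOTS_TXT.splitlines())

# The named search-engine bots match the first group: /search is blocked,
# but the rest of the site remains crawlable for them.
print(rp.can_fetch("Googlebot", "https://scholar.archive.org/search?q=coffee"))  # False
print(rp.can_fetch("Googlebot", "https://scholar.archive.org/"))                 # True

# Any other crawler ("ExampleBot" is a hypothetical name) falls through to the
# "*" group: /search is allowed, but with a 5-second crawl delay.
print(rp.can_fetch("ExampleBot", "https://scholar.archive.org/search?q=coffee"))  # True
print(rp.crawl_delay("ExampleBot"))                                               # 5
```

Note that robots.txt groups are not merged: the four named bots get only their own group's rules, so the 5-second Crawl-delay applies to other crawlers, while the named bots are simply kept out of /search entirely.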