From 52576d07393b3511474ef077171c012244b3ccd9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Jan 2022 12:54:59 -0800 Subject: robots.txt: block /search for large crawlers --- fatcat_scholar/static/robots.allow.txt | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt index 21a343e..ccdfda1 100644 --- a/fatcat_scholar/static/robots.allow.txt +++ b/fatcat_scholar/static/robots.allow.txt @@ -2,17 +2,25 @@ # If you are considering large or automated crawling, you may want to look at # our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead. +# large-scale bots should not index search pages +User-agent: SemrushBot +User-agent: YandexBot +User-agent: bingbot +User-agent: Googlebot +Disallow: /search + +# crawling search result pages is expensive, so we do specify a long crawl +# delay for those (for bots other than the above broad search bots) +User-agent: * +Allow: /search +Crawl-delay: 5 + # by default, can crawl anything on this domain. HTTP 429 ("backoff") status # codes are used for rate-limiting instead of any crawl delay specified here. # Up to a handful concurrent requests should be fine. User-Agent: * Allow: / -# crawling search result pages is expensive, so we do specify a long crawl delay for those -User-agent: * -Allow: /search -Crawl-delay: 5 - Sitemap: https://scholar.archive.org/sitemap.xml Sitemap: https://scholar.archive.org/sitemap-index-works.xml -- cgit v1.2.3