author     Bryan Newbold <bnewbold@archive.org>  2022-01-11 12:54:59 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2022-01-11 12:54:59 -0800
commit     52576d07393b3511474ef077171c012244b3ccd9 (patch)
tree       481102a25974118b43f5693da975da1cff8ef614
parent     3d830aa01867e1324d1c85ba681cf799d98605bc (diff)
robots.txt: block /search for large crawlers
-rw-r--r--  fatcat_scholar/static/robots.allow.txt | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 21a343e..ccdfda1 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -2,17 +2,25 @@
# If you are considering large or automated crawling, you may want to look at
# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
+# large-scale bots should not index search pages
+User-agent: SemrushBot
+User-agent: YandexBot
+User-agent: bingbot
+User-agent: Googlebot
+Disallow: /search
+
+# crawling search result pages is expensive, so we do specify a long crawl
+# delay for those (for bots other than the above broad search bots)
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
# codes are used for rate-limiting instead of any crawl delay specified here.
# Up to a handful concurrent requests should be fine.
User-Agent: *
Allow: /
-# crawling search result pages is expensive, so we do specify a long crawl delay for those
-User-agent: *
-Allow: /search
-Crawl-delay: 5
-
Sitemap: https://scholar.archive.org/sitemap.xml
Sitemap: https://scholar.archive.org/sitemap-index-works.xml
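
As a quick sanity check (not part of this commit), the effect of the new rules can be exercised with Python's standard-library urllib.robotparser. The sketch below parses the directives as they appear in the patched file (the trailing catch-all "User-Agent: *" / "Allow: /" group is omitted for brevity); the search URL and the "ExampleBot" agent name are illustrative only.

    from urllib.robotparser import RobotFileParser

    # Rules transcribed from the patched robots.allow.txt above
    # (final catch-all "User-Agent: *" / "Allow: /" group omitted).
    RULES = """\
    User-agent: SemrushBot
    User-agent: YandexBot
    User-agent: bingbot
    User-agent: Googlebot
    Disallow: /search

    User-agent: *
    Allow: /search
    Crawl-delay: 5
    """

    rp = RobotFileParser()
    rp.parse(RULES.splitlines())

    search_url = "https://scholar.archive.org/search?q=example"

    # The named large search-engine bots are blocked from search result pages...
    print(rp.can_fetch("Googlebot", search_url))    # False

    # ...while other crawlers may fetch them, subject to the crawl delay.
    print(rp.can_fetch("ExampleBot", search_url))   # True
    print(rp.crawl_delay("ExampleBot"))             # 5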