Diffstat (limited to 'fatcat_scholar/static/robots.allow.txt')
-rw-r--r--  fatcat_scholar/static/robots.allow.txt | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 21a343e..ccdfda1 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -2,17 +2,25 @@
# If you are considering large or automated crawling, you may want to look at
# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
+# large-scale bots should not index search pages
+User-agent: SemrushBot
+User-agent: YandexBot
+User-agent: bingbot
+User-agent: Googlebot
+Disallow: /search
+
+# crawling search result pages is expensive, so we do specify a long crawl
+# delay for those (for bots other than the above broad search bots)
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
# codes are used for rate-limiting instead of any crawl delay specified here.
# Up to a handful concurrent requests should be fine.
User-Agent: *
Allow: /
-# crawling search result pages is expensive, so we do specify a long crawl delay for those
-User-agent: *
-Allow: /search
-Crawl-delay: 5
-
Sitemap: https://scholar.archive.org/sitemap.xml
Sitemap: https://scholar.archive.org/sitemap-index-works.xml
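
As a quick sanity check of the new rules, the post-change file can be run through Python's standard urllib.robotparser. This is only an illustrative sketch, not part of the repository: the robots.txt body below is reconstructed from the added and unchanged lines of the diff (comment lines omitted for brevity), and the "SomeOtherBot" user agent and the example non-search path are made up for the test.

    import urllib.robotparser

    # Post-change rules, reconstructed from the "+" and context lines of the
    # diff above (comment lines omitted for brevity).
    ROBOTS_TXT = """\
    User-agent: SemrushBot
    User-agent: YandexBot
    User-agent: bingbot
    User-agent: Googlebot
    Disallow: /search

    User-agent: *
    Allow: /search
    Crawl-delay: 5

    User-Agent: *
    Allow: /

    Sitemap: https://scholar.archive.org/sitemap.xml
    Sitemap: https://scholar.archive.org/sitemap-index-works.xml
    """

    rp = urllib.robotparser.RobotFileParser()
    rp.parse(ROBOTS_TXT.splitlines())

    # The named broad search bots are blocked from search result pages ...
    assert not rp.can_fetch("Googlebot", "https://scholar.archive.org/search?q=test")
    # ... but can still fetch other pages on the domain (example path).
    assert rp.can_fetch("Googlebot", "https://scholar.archive.org/some-other-page")

    # Any other crawler ("SomeOtherBot" is hypothetical) may fetch /search,
    # subject to the 5-second crawl delay.
    assert rp.can_fetch("SomeOtherBot", "https://scholar.archive.org/search?q=test")
    assert rp.crawl_delay("SomeOtherBot") == 5

One caveat with this particular checker: CPython's robotparser keeps only the first "User-agent: *" group it sees, so generic crawlers here are matched against the Allow: /search / Crawl-delay: 5 block; other robots.txt consumers may merge or order the groups differently.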