diff options
-rw-r--r-- | fatcat_scholar/static/robots.allow.txt | 18 |
1 file changed, 13 insertions, 5 deletions
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt index 21a343e..ccdfda1 100644 --- a/fatcat_scholar/static/robots.allow.txt +++ b/fatcat_scholar/static/robots.allow.txt @@ -2,17 +2,25 @@ # If you are considering large or automated crawling, you may want to look at # our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead. +# large-scale bots should not index search pages +User-agent: SemrushBot +User-agent: YandexBot +User-agent: bingbot +User-agent: Googlebot +Disallow: /search + +# crawling search result pages is expensive, so we do specify a long crawl +# delay for those (for bots other than the above broad search bots) +User-agent: * +Allow: /search +Crawl-delay: 5 + # by default, can crawl anything on this domain. HTTP 429 ("backoff") status # codes are used for rate-limiting instead of any crawl delay specified here. # Up to a handful concurrent requests should be fine. User-Agent: * Allow: / -# crawling search result pages is expensive, so we do specify a long crawl delay for those -User-agent: * -Allow: /search -Crawl-delay: 5 - Sitemap: https://scholar.archive.org/sitemap.xml Sitemap: https://scholar.archive.org/sitemap-index-works.xml |