From f65f8ffb559826fa9d2ec5fd34d630735519b5dd Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 29 Apr 2021 16:13:27 -0700
Subject: web: sitemap.xml serving

---
 fatcat_scholar/static/robots.allow.txt | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fatcat_scholar/static/robots.allow.txt')

diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 3c40454..35f13a3 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -1,5 +1,17 @@
-# Allow most queries, but please don't crawl search results (or at least do so very politely, eg no parallel requests)
+# Hello friends!
+# If you are considering large or automated crawling, you may want to look at
+# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
 
+# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
+# codes are used for rate-limiting instead of any crawl delay specified here.
+# Up to a handful concurrent requests should be fine.
 User-Agent: *
-Disallow: /search
 Allow: /
+
+# crawling search result pages is expensive, so we do specify a long crawl delay for those
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
+Sitemap: /sitemap.xml
+Sitemap: /sitemap-index-works.xml
-- 
cgit v1.2.3