From f65f8ffb559826fa9d2ec5fd34d630735519b5dd Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Thu, 29 Apr 2021 16:13:27 -0700
Subject: web: sitemap.xml serving

---
 fatcat_scholar/static/robots.allow.txt | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fatcat_scholar/static/robots.allow.txt')

diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 3c40454..35f13a3 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -1,5 +1,17 @@
-# Allow most queries, but please don't crawl search results (or at least do so very politely, eg no parallel requests)
+# Hello friends!
+# If you are considering large or automated crawling, you may want to look at
+# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
 
+# By default, crawlers may fetch anything on this domain. HTTP 429 ("backoff")
+# status codes are used for rate-limiting instead of any crawl delay specified
+# here. Up to a handful of concurrent requests should be fine.
 User-Agent: *
-Disallow: /search
 Allow: /
+
+# crawling search result pages is expensive, so we do specify a long crawl delay for those
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
+Sitemap: https://scholar.archive.org/sitemap.xml
+Sitemap: https://scholar.archive.org/sitemap-index-works.xml
-- 
cgit v1.2.3