From c6b33542398c933a6272586e6280f7026b63a124 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 19 Aug 2020 23:00:34 -0700 Subject: update robots.txt and sitemap.xml - show minimal robots/sitemap if not in prod environment - default to allow all in robots.txt; link to sitemap index files - basic sitemap.xml without entity-level links --- python/fatcat_web/static/robots.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'python/fatcat_web/static/robots.txt') diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt index a168f11b..e89af36e 100644 --- a/python/fatcat_web/static/robots.txt +++ b/python/fatcat_web/static/robots.txt @@ -1 +1,20 @@ # Hello friends! +# If you are considering large or automated crawling, you may want to look at +# our API (https://api.fatcat.wiki) or bulk database snapshots instead. + +# by default, can crawl anything on this domain. HTTP 429 ("backoff") status +# codes are used for rate-limiting instead of any crawl delay specified here. +# Up to a handful of concurrent requests should be fine. +User-agent: * +Allow: / + +# crawling search result pages is expensive, so we do specify a long crawl delay for those +User-agent: * +Allow: /release/search +Allow: /container/search +Allow: /coverage/search +Crawl-delay: 5 + +Sitemap: https://fatcat.wiki/sitemap.xml +Sitemap: https://fatcat.wiki/sitemap-index-releases.xml +Sitemap: https://fatcat.wiki/sitemap-index-containers.xml -- cgit v1.2.3