Diffstat (limited to 'fatcat_scholar')
-rw-r--r--  fatcat_scholar/static/robots.allow.txt | 16 ++++++++++++++--
-rw-r--r--  fatcat_scholar/static/sitemap.xml      |  7 +++++++
-rw-r--r--  fatcat_scholar/web.py                  |  6 ++++++
3 files changed, 27 insertions, 2 deletions
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 3c40454..35f13a3 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -1,5 +1,17 @@
-# Allow most queries, but please don't crawl search results (or at least do so very politely, eg no parallel requests)
+# Hello friends!
+# If you are considering large or automated crawling, you may want to look at
+# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
+# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
+# codes are used for rate-limiting instead of any crawl delay specified here.
+# Up to a handful concurrent requests should be fine.
 
 User-Agent: *
-Disallow: /search
 Allow: /
+
+# crawling search result pages is expensive, so we do specify a long crawl delay for those
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
+Sitemap: /sitemap.xml
+Sitemap: /sitemap-index-works.xml
diff --git a/fatcat_scholar/static/sitemap.xml b/fatcat_scholar/static/sitemap.xml
new file mode 100644
index 0000000..a155373
--- /dev/null
+++ b/fatcat_scholar/static/sitemap.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <!-- basic site pages -->
+    <url><loc>https://scholar.archive.org/</loc></url>
+    <url><loc>https://scholar.archive.org/about</loc></url>
+    <url><loc>https://scholar.archive.org/help</loc></url>
+</urlset>
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index adddcbd..9963cec 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -413,6 +413,12 @@ async def favicon() -> Any:
         "fatcat_scholar/static/ia-favicon.ico", media_type="image/x-icon"
     )
 
+@app.get("/sitemap.xml", include_in_schema=False)
+async def basic_sitemap() -> Any:
+    return FileResponse(
+        "fatcat_scholar/static/sitemap.xml", media_type="application/xml"
+    )
+
 ROBOTS_ALLOW = open("fatcat_scholar/static/robots.allow.txt", "r").read()
 ROBOTS_DISALLOW = open("fatcat_scholar/static/robots.disallow.txt", "r").read()
 
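Note on the robots.allow.txt changes: the comments ask crawlers for serial requests, HTTP 429 backoff, and a 5-second delay on /search pages. Below is a minimal, standard-library Python sketch of that behavior; the query strings are made up for illustration.

import time
import urllib.error
import urllib.request
from urllib.parse import quote_plus

SEARCH_DELAY = 5  # seconds, per the Crawl-delay directive for /search


def polite_get(url: str, retries: int = 3) -> bytes:
    """Fetch one URL serially, backing off when the server answers HTTP 429."""
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url) as resp:
                return resp.read()
        except urllib.error.HTTPError as e:
            if e.code == 429:
                # server-driven rate limiting: wait, then retry
                time.sleep(2 ** attempt * SEARCH_DELAY)
                continue
            raise
    raise RuntimeError(f"giving up on {url} after {retries} attempts")


for query in ["coffee", "tea"]:  # hypothetical queries
    page = polite_get(f"https://scholar.archive.org/search?q={quote_plus(query)}")
    time.sleep(SEARCH_DELAY)  # long crawl delay for search result pages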
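The new static sitemap can be sanity-checked with the standard library. One wrinkle worth showing: sitemaps.org elements are namespaced, so findall() needs an explicit namespace map. A small sketch, assuming it runs from the repository root:

import xml.etree.ElementTree as ET

NS = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

tree = ET.parse("fatcat_scholar/static/sitemap.xml")
for loc in tree.getroot().findall("sm:url/sm:loc", NS):
    print(loc.text)
# https://scholar.archive.org/
# https://scholar.archive.org/about
# https://scholar.archive.org/help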
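Finally, a pytest-style sketch exercising the new /sitemap.xml route. It assumes the test process runs from the repository root (the FileResponse path above is relative) and that app is importable from fatcat_scholar.web, as in this diff.

from fastapi.testclient import TestClient

from fatcat_scholar.web import app


def test_basic_sitemap() -> None:
    client = TestClient(app)
    resp = client.get("/sitemap.xml")
    assert resp.status_code == 200
    assert resp.headers["content-type"].startswith("application/xml")
    assert "<urlset" in resp.text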