aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-04-29 16:13:27 -0700
committerBryan Newbold <bnewbold@archive.org>2021-04-29 16:13:27 -0700
commitf65f8ffb559826fa9d2ec5fd34d630735519b5dd (patch)
treecbb1a6d32a0c2b0a5720ad776f7b15e5cc3574ca /fatcat_scholar
parent6b1f87c12f7d40a3016910b214579a368c747df4 (diff)
downloadfatcat-scholar-f65f8ffb559826fa9d2ec5fd34d630735519b5dd.tar.gz
fatcat-scholar-f65f8ffb559826fa9d2ec5fd34d630735519b5dd.zip
web: sitemap.xml serving
Diffstat (limited to 'fatcat_scholar')
-rw-r--r--fatcat_scholar/static/robots.allow.txt16
-rw-r--r--fatcat_scholar/static/sitemap.xml7
-rw-r--r--fatcat_scholar/web.py6
3 files changed, 27 insertions, 2 deletions
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 3c40454..35f13a3 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -1,5 +1,17 @@
-# Allow most queries, but please don't crawl search results (or at least do so very politely, eg no parallel requests)
+# Hello friends!
+# If you are considering large or automated crawling, you may want to look at
+# our catalog API (https://api.fatcat.wiki) or bulk database snapshots instead.
+# By default, you can crawl anything on this domain. HTTP 429 ("backoff") status
+# codes are used for rate-limiting instead of any crawl delay specified here.
+# Up to a handful of concurrent requests should be fine.
User-Agent: *
-Disallow: /search
Allow: /
+
+# crawling search result pages is expensive, so we do specify a long crawl delay for those
+User-agent: *
+Allow: /search
+Crawl-delay: 5
+
+Sitemap: /sitemap.xml
+Sitemap: /sitemap-index-works.xml
diff --git a/fatcat_scholar/static/sitemap.xml b/fatcat_scholar/static/sitemap.xml
new file mode 100644
index 0000000..a155373
--- /dev/null
+++ b/fatcat_scholar/static/sitemap.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <!-- basic site pages -->
+ <url><loc>https://scholar.archive.org/</loc></url>
+ <url><loc>https://scholar.archive.org/about</loc></url>
+ <url><loc>https://scholar.archive.org/help</loc></url>
+</urlset>
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index adddcbd..9963cec 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -413,6 +413,12 @@ async def favicon() -> Any:
"fatcat_scholar/static/ia-favicon.ico", media_type="image/x-icon"
)
+@app.get("/sitemap.xml", include_in_schema=False)
+async def basic_sitemap() -> Any:
+ return FileResponse(
+ "fatcat_scholar/static/sitemap.xml", media_type="application/xml"
+ )
+
ROBOTS_ALLOW = open("fatcat_scholar/static/robots.allow.txt", "r").read()
ROBOTS_DISALLOW = open("fatcat_scholar/static/robots.disallow.txt", "r").read()