summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author bnewbold <bnewbold@archive.org> 2020-08-20 21:17:59 +0000
committer bnewbold <bnewbold@archive.org> 2020-08-20 21:17:59 +0000
commit daf91b137483b7345448b597289c78f8fb3f9969 (patch)
tree 712c27d902235d8d007763b512c57eaecd8045ad /python
parent 5007ee299ce07b31db6d48cd4ab2587f87af53ab (diff)
parent 2a98d10be1cc1368f9510745bff07c343974d4a7 (diff)
download fatcat-daf91b137483b7345448b597289c78f8fb3f9969.tar.gz
fatcat-daf91b137483b7345448b597289c78f8fb3f9969.zip
Merge branch 'bnewbold-sitemap' into 'master'
basic sitemap setup

See merge request webgroup/fatcat!79
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_web/routes.py 15
-rw-r--r-- python/fatcat_web/static/robots.deny_all.txt 7
-rw-r--r-- python/fatcat_web/static/robots.txt 19
-rw-r--r-- python/fatcat_web/static/sitemap.xml 13
-rw-r--r-- python/fatcat_web/templates/home.html 7
5 files changed, 54 insertions, 7 deletions
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index da2bb6cf..9ae2eaa9 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -1157,7 +1157,18 @@ def page_rfc():
return render_template('rfc.html')
@app.route('/robots.txt', methods=['GET'])
-def robots():
+def page_robots_txt():
+ if app.config['FATCAT_DOMAIN'] == "fatcat.wiki":
+ robots_path = "robots.txt"
+ else:
+ robots_path = "robots.deny_all.txt"
return send_from_directory(os.path.join(app.root_path, 'static'),
- 'robots.txt',
+ robots_path,
mimetype='text/plain')
+
+@app.route('/sitemap.xml', methods=['GET'])
+def page_sitemap_xml():
+ return send_from_directory(os.path.join(app.root_path, 'static'),
+ "sitemap.xml",
+ mimetype='text/xml')
+
diff --git a/python/fatcat_web/static/robots.deny_all.txt b/python/fatcat_web/static/robots.deny_all.txt
new file mode 100644
index 00000000..b88274b1
--- /dev/null
+++ b/python/fatcat_web/static/robots.deny_all.txt
@@ -0,0 +1,7 @@
+# Hello friends!
+
+# You have found a QA/development instance of the Fatcat catalog. The canonical
+# location is https://fatcat.wiki, please crawl and index that location instead.
+
+User-agent: *
+Disallow: /
diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt
index a168f11b..e89af36e 100644
--- a/python/fatcat_web/static/robots.txt
+++ b/python/fatcat_web/static/robots.txt
@@ -1 +1,20 @@
# Hello friends!
+# If you are considering large or automated crawling, you may want to look at
+# our API (https://api.fatcat.wiki) or bulk database snapshots instead.
+
+# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
+# codes are used for rate-limiting instead of any crawl delay specified here.
+# Up to a handful concurrent requests should be fine.
+User-agent: *
+Allow: /
+
+# crawling search result pages is expensive, so we do specify a long crawl delay for those
+User-agent: *
+Allow: /release/search
+Allow: /container/search
+Allow: /coverage/search
+Crawl-delay: 5
+
+Sitemap: https://fatcat.wiki/sitemap.xml
+Sitemap: https://fatcat.wiki/sitemap-index-releases.xml
+Sitemap: https://fatcat.wiki/sitemap-index-containers.xml
diff --git a/python/fatcat_web/static/sitemap.xml b/python/fatcat_web/static/sitemap.xml
new file mode 100644
index 00000000..e6189aa4
--- /dev/null
+++ b/python/fatcat_web/static/sitemap.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <!-- basic site pages -->
+ <url><loc>https://fatcat.wiki/</loc></url>
+ <url><loc>https://fatcat.wiki/about</loc></url>
+ <url><loc>https://fatcat.wiki/rfc</loc></url>
+ <url><loc>https://fatcat.wiki/stats</loc></url>
+ <url><loc>https://fatcat.wiki/changelog</loc></url>
+ <url><loc>https://fatcat.wiki/release/lookup</loc></url>
+ <url><loc>https://fatcat.wiki/container/lookup</loc></url>
+ <url><loc>https://fatcat.wiki/file/lookup</loc></url>
+ <!-- additional entity-level URL lists are linked from robots.txt -->
+</urlset>
diff --git a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html
index de32d6a4..7ffa64ca 100644
--- a/python/fatcat_web/templates/home.html
+++ b/python/fatcat_web/templates/home.html
@@ -8,12 +8,9 @@
{% endblock %}
{% block fullmain %}
-<!--
-<div class="ui container text" itemscope itemtype="https://schema.org/WebSite">
-<meta itemprop="url" content="https://{{ config.FATCAT_DOMAIN }}/"/>
--->
-<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;">
+<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;" itemscope itemtype="https://schema.org/WebSite">
+ <link itemprop="url" content="https://{{ config.FATCAT_DOMAIN }}/"/>
<div class="ui text container">
<h1 class="ui header inverted huge centered">Perpetual Access to Millions of Open Research Publications From Around The World</h1>
<br>