diff options
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_web/routes.py | 15
-rw-r--r-- python/fatcat_web/static/robots.deny_all.txt | 7
-rw-r--r-- python/fatcat_web/static/robots.txt | 19
-rw-r--r-- python/fatcat_web/static/sitemap.xml | 13
-rw-r--r-- python/fatcat_web/templates/home.html | 7
5 files changed, 54 insertions, 7 deletions
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index da2bb6cf..9ae2eaa9 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1157,7 +1157,18 @@ def page_rfc(): return render_template('rfc.html') @app.route('/robots.txt', methods=['GET']) -def robots(): +def page_robots_txt(): + if app.config['FATCAT_DOMAIN'] == "fatcat.wiki": + robots_path = "robots.txt" + else: + robots_path = "robots.deny_all.txt" return send_from_directory(os.path.join(app.root_path, 'static'), - 'robots.txt', + robots_path, mimetype='text/plain') + +@app.route('/sitemap.xml', methods=['GET']) +def page_sitemap_xml(): + return send_from_directory(os.path.join(app.root_path, 'static'), + "sitemap.xml", + mimetype='text/xml') + diff --git a/python/fatcat_web/static/robots.deny_all.txt b/python/fatcat_web/static/robots.deny_all.txt new file mode 100644 index 00000000..b88274b1 --- /dev/null +++ b/python/fatcat_web/static/robots.deny_all.txt @@ -0,0 +1,7 @@ +# Hello friends! + +# You have found a QA/development instance of the Fatcat catalog. The canonical +# location is https://fatcat.wiki, please crawl and index that location instead. + +User-agent: * +Disallow: / diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt index a168f11b..e89af36e 100644 --- a/python/fatcat_web/static/robots.txt +++ b/python/fatcat_web/static/robots.txt @@ -1 +1,20 @@ # Hello friends! +# If you are considering large or automated crawling, you may want to look at +# our API (https://api.fatcat.wiki) or bulk database snapshots instead. + +# by default, can crawl anything on this domain. HTTP 429 ("backoff") status +# codes are used for rate-limiting instead of any crawl delay specified here. +# Up to a handful concurrent requests should be fine. 
+User-agent: * +Allow: / + +# crawling search result pages is expensive, so we do specify a long crawl delay for those +User-agent: * +Allow: /release/search +Allow: /container/search +Allow: /coverage/search +Crawl-delay: 5 + +Sitemap: https://fatcat.wiki/sitemap.xml +Sitemap: https://fatcat.wiki/sitemap-index-releases.xml +Sitemap: https://fatcat.wiki/sitemap-index-containers.xml diff --git a/python/fatcat_web/static/sitemap.xml b/python/fatcat_web/static/sitemap.xml new file mode 100644 index 00000000..e6189aa4 --- /dev/null +++ b/python/fatcat_web/static/sitemap.xml @@ -0,0 +1,13 @@ +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <!-- basic site pages --> + <url><loc>https://fatcat.wiki/</loc></url> + <url><loc>https://fatcat.wiki/about</loc></url> + <url><loc>https://fatcat.wiki/rfc</loc></url> + <url><loc>https://fatcat.wiki/stats</loc></url> + <url><loc>https://fatcat.wiki/changelog</loc></url> + <url><loc>https://fatcat.wiki/release/lookup</loc></url> + <url><loc>https://fatcat.wiki/container/lookup</loc></url> + <url><loc>https://fatcat.wiki/file/lookup</loc></url> + <!-- additional entity-level URL lists are linked from robots.txt --> +</urlset> diff --git a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html index de32d6a4..7ffa64ca 100644 --- a/python/fatcat_web/templates/home.html +++ b/python/fatcat_web/templates/home.html @@ -8,12 +8,9 @@ {% endblock %} {% block fullmain %} -<!-- -<div class="ui container text" itemscope itemtype="https://schema.org/WebSite"> -<meta itemprop="url" content="https://{{ config.FATCAT_DOMAIN }}/"/> ---> -<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;"> +<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;" itemscope itemtype="https://schema.org/WebSite"> + <link itemprop="url" content="https://{{ 
config.FATCAT_DOMAIN }}/"/> <div class="ui text container"> <h1 class="ui header inverted huge centered">Perpetual Access to Millions of Open Research Publications From Around The World</h1> <br> |