From c6b33542398c933a6272586e6280f7026b63a124 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 19 Aug 2020 23:00:34 -0700 Subject: update robots.txt and sitemap.xml - show minimal robots/sitemap if not in prod environment - default to allow all in robots.txt; link to sitemap index files - basic sitemap.xml without entity-level links --- python/fatcat_web/routes.py | 15 +++++++++++++-- python/fatcat_web/static/robots.deny_all.txt | 7 +++++++ python/fatcat_web/static/robots.txt | 19 +++++++++++++++++++ python/fatcat_web/static/sitemap.xml | 13 +++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 python/fatcat_web/static/robots.deny_all.txt create mode 100644 python/fatcat_web/static/sitemap.xml (limited to 'python/fatcat_web') diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index da2bb6cf..c66c7f0c 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1157,7 +1157,18 @@ def page_rfc(): return render_template('rfc.html') @app.route('/robots.txt', methods=['GET']) -def robots(): +def page_robots_txt(): + if conf.FATCAT_DOMAIN == "fatcat.wiki": + robots_path = "robots.txt" + else: + robots_path = "robots.deny_all.txt" return send_from_directory(os.path.join(app.root_path, 'static'), - 'robots.txt', + robots_path, mimetype='text/plain') + +@app.route('/sitemap.xml', methods=['GET']) +def page_sitemap_xml(): + if conf.FATCAT_DOMAIN == "fatcat.wiki": + return redirect('/sitemaps/sitemap.xml') + else: + abort(404) diff --git a/python/fatcat_web/static/robots.deny_all.txt b/python/fatcat_web/static/robots.deny_all.txt new file mode 100644 index 00000000..b88274b1 --- /dev/null +++ b/python/fatcat_web/static/robots.deny_all.txt @@ -0,0 +1,7 @@ +# Hello friends! + +# You have found a QA/development instance of the Fatcat catalog. The canonical +# location is https://fatcat.wiki, please crawl and index that location instead. + +User-agent: * +Disallow: / diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt index a168f11b..e89af36e 100644 --- a/python/fatcat_web/static/robots.txt +++ b/python/fatcat_web/static/robots.txt @@ -1 +1,20 @@ # Hello friends! +# If you are considering large or automated crawling, you may want to look at +# our API (https://api.fatcat.wiki) or bulk database snapshots instead. + +# by default, can crawl anything on this domain. HTTP 429 ("backoff") status +# codes are used for rate-limiting instead of any crawl delay specified here. +# Up to a handful concurrent requests should be fine. +User-agent: * +Allow: / + +# crawling search result pages is expensive, so we do specify a long crawl delay for those +User-agent: * +Allow: /release/search +Allow: /container/search +Allow: /coverage/search +Crawl-delay: 5 + +Sitemap: https://fatcat.wiki/sitemap.xml +Sitemap: https://fatcat.wiki/sitemap-index-releases.xml +Sitemap: https://fatcat.wiki/sitemap-index-containers.xml diff --git a/python/fatcat_web/static/sitemap.xml b/python/fatcat_web/static/sitemap.xml new file mode 100644 index 00000000..e6189aa4 --- /dev/null +++ b/python/fatcat_web/static/sitemap.xml @@ -0,0 +1,13 @@ + + + + https://fatcat.wiki/ + https://fatcat.wiki/about + https://fatcat.wiki/rfc + https://fatcat.wiki/stats + https://fatcat.wiki/changelog + https://fatcat.wiki/release/lookup + https://fatcat.wiki/container/lookup + https://fatcat.wiki/file/lookup + + -- cgit v1.2.3 From c15cbf3568f7d91774e1cb82a39474c0ff874616 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 19 Aug 2020 23:57:31 -0700 Subject: sitemap fixes from testing --- extra/sitemap/README.md | 11 +++++++++++ extra/sitemap/generate_sitemap_indices.py | 2 +- extra/sitemap/release_url_lists.sh | 6 +++--- python/fatcat_web/routes.py | 10 +++++----- 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'python/fatcat_web') diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md index 735ac925..f72893cd 100644 --- a/extra/sitemap/README.md +++ b/extra/sitemap/README.md @@ -1,4 +1,15 @@ +## HOWTO: Update + +After a container dump, as `fatcat` user on prod server: + + cd /srv/fatcat/sitemap + export DATE=`date --iso-8601` # or whatever + /srv/fatcat/src/extra/sitemap/container_url_lists.sh $DATE /srv/fatcat/snapshots/container_export.json.gz + /srv/fatcat/src/extra/sitemap/release_url_lists.sh $DATE /srv/fatcat/snapshots/release_export_expanded.json.gz + # delete old sitemap url lists + /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py + ## Background Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py index 9766ac1f..0a5624a1 100755 --- a/extra/sitemap/generate_sitemap_indices.py +++ b/extra/sitemap/generate_sitemap_indices.py @@ -6,7 +6,7 @@ import datetime def index_entity(entity_type, output): - now = datetime.datetime.now().isoformat() + now = datetime.date.today().isoformat() print("""""", file=output) print("""""", file=output) diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh index 4190011f..d5c8d4ef 100755 --- a/extra/sitemap/release_url_lists.sh +++ b/extra/sitemap/release_url_lists.sh @@ -6,14 +6,14 @@ set -o pipefail # fail if part of a '|' command fails : ${1?' You you did not supply a date argument'} : ${2?' You you did not supply an input file (JSON gzip)'} -if [ -f $2 ] ; then +if [ ! -f $2 ] ; then echo "Input file not found: $2" && exit 1; fi # eg, 2020-08-19 -DATE = "$1" +DATE="$1" # eg, release_export_expanded.json.gz -EXPORT_FILE_GZ = "$2" +EXPORT_FILE_GZ="$2" # filter to fulltext releases only, then filter to only one hit per work zcat $EXPORT_FILE_GZ \ diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index c66c7f0c..9ae2eaa9 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1158,7 +1158,7 @@ def page_rfc(): @app.route('/robots.txt', methods=['GET']) def page_robots_txt(): - if conf.FATCAT_DOMAIN == "fatcat.wiki": + if app.config['FATCAT_DOMAIN'] == "fatcat.wiki": robots_path = "robots.txt" else: robots_path = "robots.deny_all.txt" @@ -1168,7 +1168,7 @@ def page_robots_txt(): @app.route('/sitemap.xml', methods=['GET']) def page_sitemap_xml(): - if conf.FATCAT_DOMAIN == "fatcat.wiki": - return redirect('/sitemaps/sitemap.xml') - else: - abort(404) + return send_from_directory(os.path.join(app.root_path, 'static'), + "sitemap.xml", + mimetype='text/xml') + -- cgit v1.2.3 From 2a98d10be1cc1368f9510745bff07c343974d4a7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 20 Aug 2020 00:13:21 -0700 Subject: fix SearchAction nesting in WebSite (schema.org) This is not related to sitemap changes, but I was reminded in google search tools when validating site. --- python/fatcat_web/templates/home.html | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'python/fatcat_web') diff --git a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html index de32d6a4..7ffa64ca 100644 --- a/python/fatcat_web/templates/home.html +++ b/python/fatcat_web/templates/home.html @@ -8,12 +8,9 @@ {% endblock %} {% block fullmain %} - -
+
+

Perpetual Access to Millions of Open Research Publications From Around The World


-- cgit v1.2.3