From 88a99387e09c7c43803129e72215ef3f6b4cafc6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 7 May 2019 17:30:10 -0700
Subject: initial sitemap.xml notes/template

---
 extra/sitemap/README.md   | 23 +++++++++++++++++++++++
 extra/sitemap/sitemap.xml |  6 ++++++
 2 files changed, 29 insertions(+)
 create mode 100644 extra/sitemap/README.md
 create mode 100644 extra/sitemap/sitemap.xml
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
new file mode 100644
index 00000000..6963bb1f
--- /dev/null
+++ b/extra/sitemap/README.md
@@ -0,0 +1,23 @@
+
+Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K
+lines / 50 MByte for XML site map files.
+
+With a baseline of 100 million entities, that requires an index file pointing
+to at least 2000x individual sitemaps. 3 hex characters is 12 bits, or 4096
+options; seems like an ok granularity to start with.
+
+Should look into what archive.org does to generate their sitemap.xml; it seems
+simple, and comes in batches of exactly 50k.
+
+## Text Sitemaps
+
+Should be possible to create simple text-style sitemaps, one URL per line, and
+link to these from a sitemap index. This is appealing because the sitemaps can
+be generated very quickly from identifier SQL dump files, run through UNIX
+commands (eg, to split and turn into URLs). Some script to create an XML
+sitemap index to point at all the sitemaps would still be needed though.
+
+
+## Resources
+
+Google sitemap verifier: https://support.google.com/webmasters/answer/7451001
diff --git a/extra/sitemap/sitemap.xml b/extra/sitemap/sitemap.xml
new file mode 100644
index 00000000..4404bdc2
--- /dev/null
+++ b/extra/sitemap/sitemap.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+        <loc>{{page[0]|safe}}</loc>
+    </url>
+</urlset>
-- 
cgit v1.2.3


From 5f282a6267182214080ca36bcec4da1755589b46 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 19 Aug 2020 22:55:05 -0700
Subject: iterate on sitemap generation

---
 extra/sitemap/.gitignore                  |  3 +++
 extra/sitemap/README.md                   | 37 ++++++++++++++++++++++++++++++-
 extra/sitemap/container_url_lists.sh      | 23 +++++++++++++++++++
 extra/sitemap/generate_sitemap_indices.py | 28 +++++++++++++++++++++++
 extra/sitemap/release_url_lists.sh        | 29 ++++++++++++++++++++++++
 extra/sitemap/sitemap.xml                 |  6 -----
 6 files changed, 119 insertions(+), 7 deletions(-)
 create mode 100644 extra/sitemap/.gitignore
 create mode 100755 extra/sitemap/container_url_lists.sh
 create mode 100755 extra/sitemap/generate_sitemap_indices.py
 create mode 100755 extra/sitemap/release_url_lists.sh
 delete mode 100644 extra/sitemap/sitemap.xml

diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore
new file mode 100644
index 00000000..5dd7dadc
--- /dev/null
+++ b/extra/sitemap/.gitignore
@@ -0,0 +1,3 @@
+*.txt.gz
+*.xml
+*.json.gz
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 6963bb1f..735ac925 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -1,6 +1,41 @@
 
+## Background
+
 Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K
-lines / 50 MByte for XML site map files.
+lines / 50 MByte for XML site map files. Google Scholar has indicated a smaller
+20k URL / 5 MB limit.
+
+For the time being, we will include only a subset of fatcat entities and pages
+in our sitemaps.
+
+- homepage, "about" pages
+- all container landing pages (~150k)
+- "best" release landing page for each work with fulltext (~25 million)
+
+In the short term, calculating "best" is tricky so let's just take the first
+release with fulltext per work.
+
+In tree form:
+
+- `/robots.txt`: static file (in web app)
+  - `/sitemap.xml`: about page, etc. static file (in web app)
+  - `/sitemap-index-containers.xml`: points to .txt.gz URL lists; generated by scripts
+    - `/sitemap-containers-<date>-<shard>.txt.gz`
+  - `/sitemap-index-releases.xml`: same as above
+    - `/sitemap-releases-<date>-<shard>.txt.gz`
+
+Workflow:
+
+- run bash script over container dump, outputting compressed, sharded container sitemaps
+- run bash script over work-grouped release dump, outputting compressed, sharded release sitemaps
+- run python script to output the sitemap index files (`sitemap-index-containers.xml`, `sitemap-index-releases.xml`)
+- `scp` all of this into place
+
+To make this work, we will configure an nginx rule to point all requests like
+`/sitemap-*` to the directory `/srv/fatcat/sitemap/`, and will collect output
+there.
+
+## Ideas on Huge (complete) Index
 
 With a baseline of 100 million entities, that requires an index file pointing
 to at least 2000x individual sitemaps. 3 hex characters is 12 bits, or 4096
diff --git a/extra/sitemap/container_url_lists.sh b/extra/sitemap/container_url_lists.sh
new file mode 100755
index 00000000..fcc0f4b6
--- /dev/null
+++ b/extra/sitemap/container_url_lists.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+: ${1?' You you did not supply a date argument'}
+: ${2?' You you did not supply an input file (JSON gzip)'}
+if [ ! -f $2 ] ; then
+  echo "Input file not found: $2" && exit 1;
+fi
+
+# eg, 2020-08-19
+DATE="$1"
+# eg, container_export.json.gz
+EXPORT_FILE_GZ="$2"
+
+zcat $EXPORT_FILE_GZ \
+    | jq .ident -r \
+    | awk '{print "https://fatcat.wiki/container/" $1 }' \
+    | split --lines 20000 - sitemap-containers-$DATE- -d -a 5 --additional-suffix .txt
+
+gzip sitemap-containers-*.txt
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
new file mode 100755
index 00000000..9766ac1f
--- /dev/null
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+import sys
+import glob
+import datetime
+
+def index_entity(entity_type, output):
+
+    now = datetime.datetime.now().isoformat()
+    print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
+    print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
+
+    for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"):
+        print("  <sitemap>", file=output)
+        print(f"    <loc>https://fatcat.wiki/{filename}</loc>", file=output)
+        print(f"    <lastmod>{now}</lastmod>", file=output)
+        print("  </sitemap>", file=output)
+
+    print("</sitemapindex>", file=output)
+
+def main():
+    with open('sitemap-index-containers.xml', 'w') as output:
+        index_entity("containers", output)
+    with open('sitemap-index-releases.xml', 'w') as output:
+        index_entity("releases", output)
+
+if __name__=="__main__":
+    main()
diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
new file mode 100755
index 00000000..4190011f
--- /dev/null
+++ b/extra/sitemap/release_url_lists.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+: ${1?' You you did not supply a date argument'}
+: ${2?' You you did not supply an input file (JSON gzip)'}
+if [ -f $2 ] ; then
+  echo "Input file not found: $2" && exit 1;
+fi
+
+# eg, 2020-08-19
+DATE = "$1"
+# eg, release_export_expanded.json.gz
+EXPORT_FILE_GZ = "$2"
+
+# filter to fulltext releases only, then filter to only one hit per work
+zcat $EXPORT_FILE_GZ \
+    | rg '"release_ids"' \
+    | rg 'archive.org/' \
+    | rg -v '"stub"' \
+    | jq -r '[.work_id, .ident] | @tsv' \
+    | uniq -w 26 \
+    | cut -f 2 \
+    | awk '{print "https://fatcat.wiki/release/" $1 }' \
+    | split --lines 20000 - sitemap-releases-$DATE- -d -a 5 --additional-suffix .txt
+
+gzip sitemap-releases-*.txt
diff --git a/extra/sitemap/sitemap.xml b/extra/sitemap/sitemap.xml
deleted file mode 100644
index 4404bdc2..00000000
--- a/extra/sitemap/sitemap.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-    <url>
-        <loc>{{page[0]|safe}}</loc>
-    </url>
-</urlset>
-- 
cgit v1.2.3


From c6b33542398c933a6272586e6280f7026b63a124 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 19 Aug 2020 23:00:34 -0700
Subject: update robots.txt and sitemap.xml

- show minimal robots/sitemap if not in prod environment
- default to allow all in robots.txt; link to sitemap index files
- basic sitemap.xml without entity-level links
---
 python/fatcat_web/routes.py                  | 15 +++++++++++++--
 python/fatcat_web/static/robots.deny_all.txt |  7 +++++++
 python/fatcat_web/static/robots.txt          | 19 +++++++++++++++++++
 python/fatcat_web/static/sitemap.xml         | 13 +++++++++++++
 4 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 python/fatcat_web/static/robots.deny_all.txt
 create mode 100644 python/fatcat_web/static/sitemap.xml

diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index da2bb6cf..c66c7f0c 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -1157,7 +1157,18 @@ def page_rfc():
     return render_template('rfc.html')
 
 @app.route('/robots.txt', methods=['GET'])
-def robots():
+def page_robots_txt():
+    if conf.FATCAT_DOMAIN == "fatcat.wiki":
+        robots_path = "robots.txt"
+    else:
+        robots_path = "robots.deny_all.txt"
     return send_from_directory(os.path.join(app.root_path, 'static'),
-                               'robots.txt',
+                               robots_path,
                                mimetype='text/plain')
+
+@app.route('/sitemap.xml', methods=['GET'])
+def page_sitemap_xml():
+    if conf.FATCAT_DOMAIN == "fatcat.wiki":
+        return redirect('/sitemaps/sitemap.xml')
+    else:
+        abort(404)
diff --git a/python/fatcat_web/static/robots.deny_all.txt b/python/fatcat_web/static/robots.deny_all.txt
new file mode 100644
index 00000000..b88274b1
--- /dev/null
+++ b/python/fatcat_web/static/robots.deny_all.txt
@@ -0,0 +1,7 @@
+# Hello friends!
+
+# You have found a QA/development instance of the Fatcat catalog. The canonical
+# location is https://fatcat.wiki, please crawl and index that location instead.
+
+User-agent: *
+Disallow: /
diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt
index a168f11b..e89af36e 100644
--- a/python/fatcat_web/static/robots.txt
+++ b/python/fatcat_web/static/robots.txt
@@ -1 +1,20 @@
 # Hello friends!
+# If you are considering large or automated crawling, you may want to look at
+# our API (https://api.fatcat.wiki) or bulk database snapshots instead.
+
+# by default, can crawl anything on this domain. HTTP 429 ("backoff") status
+# codes are used for rate-limiting instead of any crawl delay specified here.
+# Up to a handful of concurrent requests should be fine.
+User-agent: *
+Allow: /
+
+# crawling search result pages is expensive, so we do specify a long crawl delay for those
+User-agent: *
+Allow: /release/search
+Allow: /container/search
+Allow: /coverage/search
+Crawl-delay: 5
+
+Sitemap: https://fatcat.wiki/sitemap.xml
+Sitemap: https://fatcat.wiki/sitemap-index-releases.xml
+Sitemap: https://fatcat.wiki/sitemap-index-containers.xml
diff --git a/python/fatcat_web/static/sitemap.xml b/python/fatcat_web/static/sitemap.xml
new file mode 100644
index 00000000..e6189aa4
--- /dev/null
+++ b/python/fatcat_web/static/sitemap.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <!-- basic site pages -->
+  <url><loc>https://fatcat.wiki/</loc></url>
+  <url><loc>https://fatcat.wiki/about</loc></url>
+  <url><loc>https://fatcat.wiki/rfc</loc></url>
+  <url><loc>https://fatcat.wiki/stats</loc></url>
+  <url><loc>https://fatcat.wiki/changelog</loc></url>
+  <url><loc>https://fatcat.wiki/release/lookup</loc></url>
+  <url><loc>https://fatcat.wiki/container/lookup</loc></url>
+  <url><loc>https://fatcat.wiki/file/lookup</loc></url>
+  <!-- additional entity-level URL lists are linked from robots.txt -->
+</urlset>
-- 
cgit v1.2.3


From c15cbf3568f7d91774e1cb82a39474c0ff874616 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 19 Aug 2020 23:57:31 -0700
Subject: sitemap fixes from testing

---
 extra/sitemap/README.md                   | 11 +++++++++++
 extra/sitemap/generate_sitemap_indices.py |  2 +-
 extra/sitemap/release_url_lists.sh        |  6 +++---
 python/fatcat_web/routes.py               | 10 +++++-----
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 735ac925..f72893cd 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -1,4 +1,15 @@
 
+## HOWTO: Update
+
+After a container dump, as `fatcat` user on prod server:
+
+    cd /srv/fatcat/sitemap
+    export DATE=`date --iso-8601` # or whatever
+    /srv/fatcat/src/extra/sitemap/container_url_lists.sh $DATE /srv/fatcat/snapshots/container_export.json.gz
+    /srv/fatcat/src/extra/sitemap/release_url_lists.sh $DATE /srv/fatcat/snapshots/release_export_expanded.json.gz
+    # delete old sitemap url lists
+    /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py
+
 ## Background
 
 Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
index 9766ac1f..0a5624a1 100755
--- a/extra/sitemap/generate_sitemap_indices.py
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -6,7 +6,7 @@ import datetime
 
 def index_entity(entity_type, output):
 
-    now = datetime.datetime.now().isoformat()
+    now = datetime.date.today().isoformat()
     print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
     print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
 
diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
index 4190011f..d5c8d4ef 100755
--- a/extra/sitemap/release_url_lists.sh
+++ b/extra/sitemap/release_url_lists.sh
@@ -6,14 +6,14 @@ set -o pipefail     # fail if part of a '|' command fails
 
 : ${1?' You you did not supply a date argument'}
 : ${2?' You you did not supply an input file (JSON gzip)'}
-if [ -f $2 ] ; then
+if [ ! -f $2 ] ; then
   echo "Input file not found: $2" && exit 1;
 fi
 
 # eg, 2020-08-19
-DATE = "$1"
+DATE="$1"
 # eg, release_export_expanded.json.gz
-EXPORT_FILE_GZ = "$2"
+EXPORT_FILE_GZ="$2"
 
 # filter to fulltext releases only, then filter to only one hit per work
 zcat $EXPORT_FILE_GZ \
diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py
index c66c7f0c..9ae2eaa9 100644
--- a/python/fatcat_web/routes.py
+++ b/python/fatcat_web/routes.py
@@ -1158,7 +1158,7 @@ def page_rfc():
 
 @app.route('/robots.txt', methods=['GET'])
 def page_robots_txt():
-    if conf.FATCAT_DOMAIN == "fatcat.wiki":
+    if app.config['FATCAT_DOMAIN'] == "fatcat.wiki":
         robots_path = "robots.txt"
     else:
         robots_path = "robots.deny_all.txt"
@@ -1168,7 +1168,7 @@ def page_robots_txt():
 
 @app.route('/sitemap.xml', methods=['GET'])
 def page_sitemap_xml():
-    if conf.FATCAT_DOMAIN == "fatcat.wiki":
-        return redirect('/sitemaps/sitemap.xml')
-    else:
-        abort(404)
+    return send_from_directory(os.path.join(app.root_path, 'static'),
+                               "sitemap.xml",
+                               mimetype='text/xml')
+
-- 
cgit v1.2.3


From 2a98d10be1cc1368f9510745bff07c343974d4a7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 20 Aug 2020 00:13:21 -0700
Subject: fix SearchAction nesting in WebSite (schema.org)

This is not related to sitemap changes, but I was reminded in google
search tools when validating site.
---
 python/fatcat_web/templates/home.html | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html
index de32d6a4..7ffa64ca 100644
--- a/python/fatcat_web/templates/home.html
+++ b/python/fatcat_web/templates/home.html
@@ -8,12 +8,9 @@
 {% endblock %}
 
 {% block fullmain %}
-<!--
-<div class="ui container text" itemscope itemtype="https://schema.org/WebSite">
-<meta itemprop="url" content="https://{{ config.FATCAT_DOMAIN }}/"/>
--->
 
-<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;">
+<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;" itemscope itemtype="https://schema.org/WebSite">
+  <link itemprop="url" href="https://{{ config.FATCAT_DOMAIN }}/"/>{# microdata takes a link element's value from href, not content #}
   <div class="ui text container">
     <h1 class="ui header inverted huge centered">Perpetual Access to Millions of Open Research Publications From Around The World</h1>
     <br>
-- 
cgit v1.2.3