From 88a99387e09c7c43803129e72215ef3f6b4cafc6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 7 May 2019 17:30:10 -0700 Subject: initial sitemap.xml notes/template --- extra/sitemap/README.md | 23 +++++++++++++++++++++++ extra/sitemap/sitemap.xml | 6 ++++++ 2 files changed, 29 insertions(+) create mode 100644 extra/sitemap/README.md create mode 100644 extra/sitemap/sitemap.xml (limited to 'extra') diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md new file mode 100644 index 00000000..6963bb1f --- /dev/null +++ b/extra/sitemap/README.md @@ -0,0 +1,23 @@ + +Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K +lines / 50 MByte for XML site map files. + +With a baseline of 100 million entities, that requires an index file pointing +to at least 2000x individual sitemaps. 3 hex characters is 12 bits, or 4096 +options; seems like an ok granularity to start with. + +Should look in to what archive.org does to generate their sitemap.xml, seems +simple, and comes in batches of exactly 50k. + +## Text Sitemaps + +Should be possible to create simple text-style sitemaps, one URL per line, and +link to these from a sitemap index. This is appealing because the sitemaps can +be generated very quickly from identifier SQL dump files, run through UNIX +commands (eg, to split and turn into URLs). Some script to create an XML +sitemap index to point at all the sitemaps would still be needed though. 
+ + +## Resources + +Google sitemap verifier: https://support.google.com/webmasters/answer/7451001 diff --git a/extra/sitemap/sitemap.xml b/extra/sitemap/sitemap.xml new file mode 100644 index 00000000..4404bdc2 --- /dev/null +++ b/extra/sitemap/sitemap.xml @@ -0,0 +1,6 @@ + + + + {{page[0]|safe}} + + -- cgit v1.2.3 From 5f282a6267182214080ca36bcec4da1755589b46 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 19 Aug 2020 22:55:05 -0700 Subject: iterate on sitemap generation --- extra/sitemap/.gitignore | 3 +++ extra/sitemap/README.md | 37 ++++++++++++++++++++++++++++++- extra/sitemap/container_url_lists.sh | 23 +++++++++++++++++++ extra/sitemap/generate_sitemap_indices.py | 28 +++++++++++++++++++++++ extra/sitemap/release_url_lists.sh | 29 ++++++++++++++++++++++++ extra/sitemap/sitemap.xml | 6 ----- 6 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 extra/sitemap/.gitignore create mode 100755 extra/sitemap/container_url_lists.sh create mode 100755 extra/sitemap/generate_sitemap_indices.py create mode 100755 extra/sitemap/release_url_lists.sh delete mode 100644 extra/sitemap/sitemap.xml (limited to 'extra') diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore new file mode 100644 index 00000000..5dd7dadc --- /dev/null +++ b/extra/sitemap/.gitignore @@ -0,0 +1,3 @@ +*.txt.gz +*.xml +*.json.gz diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md index 6963bb1f..735ac925 100644 --- a/extra/sitemap/README.md +++ b/extra/sitemap/README.md @@ -1,6 +1,41 @@ +## Background + Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K -lines / 50 MByte for XML site map files. +lines / 50 MByte for XML site map files. Google Scholar has indicated a smaller +20k URL / 5 MB limit. + +For the time being, we will include only a subset of fatcat entities and pages +in our sitemaps. 
+ +- homepage, "about" pages +- all container landing pages (~150k) +- "best" release landing page for each work with fulltext (~25 million) + +In the short term, calculating "best" is tricky so let's just take the first +release with fulltext per work. + +In tree form: + +- `/robots.txt`: static file (in web app) + - `/sitemap.xml`: about page, etc. static file (in web app) + - `/sitemap-containers-index.xml`: points to .txt URL lists; generated by scripts + - `/sitemap-containers-<date>-<shard>.txt` + - `/sitemap-releases-index.xml`: same as above + - `/sitemap-releases-<date>-<shard>.txt` + +Workflow: + +- run bash script over container dump, outputting compressed, sharded container sitemaps +- run bash script over work-grouped release dump, outputting compressed, sharded release sitemaps +- run python script to output top-level `sitemap.xml` +- `scp` all of this into place + +To make this work, will configure an nginx rule to point all requests like +`/sitemap-*` to the directory `/srv/fatcat/sitemap/`, and will collect output +there. + +## Ideas on Huge (complete) Index With a baseline of 100 million entities, that requires an index file pointing to at least 2000x individual sitemaps. 3 hex characters is 12 bits, or 4096 diff --git a/extra/sitemap/container_url_lists.sh b/extra/sitemap/container_url_lists.sh new file mode 100755 index 00000000..fcc0f4b6 --- /dev/null +++ b/extra/sitemap/container_url_lists.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +: ${1?' You you did not supply a date argument'} +: ${2?' You you did not supply an input file (JSON gzip)'} +if [ ! 
-f $2 ] ; then + echo "Input file not found: $2" && exit 1; +fi + +# eg, 2020-08-19 +DATE="$1" +# eg, container_export.json.gz +EXPORT_FILE_GZ="$2" + +zcat $EXPORT_FILE_GZ \ + | jq .ident -r \ + | awk '{print "https://fatcat.wiki/container/" $1 }' \ + | split --lines 20000 - sitemap-containers-$DATE- -d -a 5 --additional-suffix .txt + +gzip sitemap-containers-*.txt diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py new file mode 100755 index 00000000..9766ac1f --- /dev/null +++ b/extra/sitemap/generate_sitemap_indices.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import sys +import glob +import datetime + +def index_entity(entity_type, output): + + now = datetime.datetime.now().isoformat() + print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output) + print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output) + + for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"): + print(" <sitemap>", file=output) + print(f" <loc>https://fatcat.wiki/{filename}</loc>", file=output) + print(f" <lastmod>{now}</lastmod>", file=output) + print(" </sitemap>", file=output) + + print("</sitemapindex>", file=output) + +def main(): + with open('sitemap-index-containers.xml', 'w') as output: + index_entity("containers", output) + with open('sitemap-index-releases.xml', 'w') as output: + index_entity("releases", output) + +if __name__=="__main__": + main() diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh new file mode 100755 index 00000000..4190011f --- /dev/null +++ b/extra/sitemap/release_url_lists.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +: ${1?' You you did not supply a date argument'} +: ${2?' 
You you did not supply an input file (JSON gzip)'} +if [ -f $2 ] ; then + echo "Input file not found: $2" && exit 1; +fi + +# eg, 2020-08-19 +DATE = "$1" +# eg, release_export_expanded.json.gz +EXPORT_FILE_GZ = "$2" + +# filter to fulltext releases only, then filter to only one hit per work +zcat $EXPORT_FILE_GZ \ + | rg '"release_ids"' \ + | rg 'archive.org/' \ + | rg -v '"stub"' \ + | jq -r '[.work_id, .ident] | @tsv' \ + | uniq -w 26 \ + | cut -f 2 \ + | awk '{print "https://fatcat.wiki/release/" $1 }' \ + | split --lines 20000 - sitemap-releases-$DATE- -d -a 5 --additional-suffix .txt + +gzip sitemap-releases-*.txt diff --git a/extra/sitemap/sitemap.xml b/extra/sitemap/sitemap.xml deleted file mode 100644 index 4404bdc2..00000000 --- a/extra/sitemap/sitemap.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - {{page[0]|safe}} - - -- cgit v1.2.3 From c15cbf3568f7d91774e1cb82a39474c0ff874616 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 19 Aug 2020 23:57:31 -0700 Subject: sitemap fixes from testing --- extra/sitemap/README.md | 11 +++++++++++ extra/sitemap/generate_sitemap_indices.py | 2 +- extra/sitemap/release_url_lists.sh | 6 +++--- python/fatcat_web/routes.py | 10 +++++----- 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'extra') diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md index 735ac925..f72893cd 100644 --- a/extra/sitemap/README.md +++ b/extra/sitemap/README.md @@ -1,4 +1,15 @@ +## HOWTO: Update + +After a container dump, as `fatcat` user on prod server: + + cd /srv/fatcat/sitemap + export DATE=`date --iso-8601` # or whatever + /srv/fatcat/src/extra/sitemap/container_url_lists.sh $DATE /srv/fatcat/snapshots/container_export.json.gz + /srv/fatcat/src/extra/sitemap/release_url_lists.sh $DATE /srv/fatcat/snapshots/release_export_expanded.json.gz + # delete old sitemap url lists + /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py + ## Background Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K 
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py index 9766ac1f..0a5624a1 100755 --- a/extra/sitemap/generate_sitemap_indices.py +++ b/extra/sitemap/generate_sitemap_indices.py @@ -6,7 +6,7 @@ import datetime def index_entity(entity_type, output): - now = datetime.datetime.now().isoformat() + now = datetime.date.today().isoformat() print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output) print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output) diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh index 4190011f..d5c8d4ef 100755 --- a/extra/sitemap/release_url_lists.sh +++ b/extra/sitemap/release_url_lists.sh @@ -6,14 +6,14 @@ set -o pipefail # fail if part of a '|' command fails : ${1?' You you did not supply a date argument'} : ${2?' You you did not supply an input file (JSON gzip)'} -if [ -f $2 ] ; then +if [ ! -f $2 ] ; then echo "Input file not found: $2" && exit 1; fi # eg, 2020-08-19 -DATE = "$1" +DATE="$1" # eg, release_export_expanded.json.gz -EXPORT_FILE_GZ = "$2" +EXPORT_FILE_GZ="$2" # filter to fulltext releases only, then filter to only one hit per work zcat $EXPORT_FILE_GZ \ diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index c66c7f0c..9ae2eaa9 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1158,7 +1158,7 @@ def page_rfc(): @app.route('/robots.txt', methods=['GET']) def page_robots_txt(): - if conf.FATCAT_DOMAIN == "fatcat.wiki": + if app.config['FATCAT_DOMAIN'] == "fatcat.wiki": robots_path = "robots.txt" else: robots_path = "robots.deny_all.txt" @@ -1168,7 +1168,7 @@ def page_robots_txt(): @app.route('/sitemap.xml', methods=['GET']) def page_sitemap_xml(): - if conf.FATCAT_DOMAIN == "fatcat.wiki": - return redirect('/sitemaps/sitemap.xml') - else: - abort(404) + return send_from_directory(os.path.join(app.root_path, 'static'), + "sitemap.xml", + mimetype='text/xml') + -- cgit v1.2.3