aboutsummaryrefslogtreecommitdiffstats
path: root/extra/sitemap
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-08-19 23:57:31 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-08-19 23:57:31 -0700
commit: c15cbf3568f7d91774e1cb82a39474c0ff874616 (patch)
tree: be5dffb5f735ab497c9affe155fd82267ed4b412 /extra/sitemap
parent: c6b33542398c933a6272586e6280f7026b63a124 (diff)
download: fatcat-c15cbf3568f7d91774e1cb82a39474c0ff874616.tar.gz
download: fatcat-c15cbf3568f7d91774e1cb82a39474c0ff874616.zip
sitemap fixes from testing
Diffstat (limited to 'extra/sitemap')
-rw-r--r-- extra/sitemap/README.md 11
-rwxr-xr-x extra/sitemap/generate_sitemap_indices.py 2
-rwxr-xr-x extra/sitemap/release_url_lists.sh 6
3 files changed, 15 insertions, 4 deletions
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 735ac925..f72893cd 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -1,4 +1,15 @@
+## HOWTO: Update
+
+After a container dump, as `fatcat` user on prod server:
+
+ cd /srv/fatcat/sitemap
+ export DATE=`date --iso-8601` # or whatever
+ /srv/fatcat/src/extra/sitemap/container_url_lists.sh $DATE /srv/fatcat/snapshots/container_export.json.gz
+ /srv/fatcat/src/extra/sitemap/release_url_lists.sh $DATE /srv/fatcat/snapshots/release_export_expanded.json.gz
+ # delete old sitemap url lists
+ /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py
+
## Background
Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
index 9766ac1f..0a5624a1 100755
--- a/extra/sitemap/generate_sitemap_indices.py
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -6,7 +6,7 @@ import datetime
def index_entity(entity_type, output):
- now = datetime.datetime.now().isoformat()
+ now = datetime.date.today().isoformat()
print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
index 4190011f..d5c8d4ef 100755
--- a/extra/sitemap/release_url_lists.sh
+++ b/extra/sitemap/release_url_lists.sh
@@ -6,14 +6,14 @@ set -o pipefail # fail if part of a '|' command fails
: ${1?' You you did not supply a date argument'}
: ${2?' You you did not supply an input file (JSON gzip)'}
-if [ -f $2 ] ; then
+if [ ! -f $2 ] ; then
echo "Input file not found: $2" && exit 1;
fi
# eg, 2020-08-19
-DATE = "$1"
+DATE="$1"
# eg, release_export_expanded.json.gz
-EXPORT_FILE_GZ = "$2"
+EXPORT_FILE_GZ="$2"
# filter to fulltext releases only, then filter to only one hit per work
zcat $EXPORT_FILE_GZ \