diff options
| field | value | date |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-19 23:57:31 -0700 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-19 23:57:31 -0700 |
| commit | c15cbf3568f7d91774e1cb82a39474c0ff874616 (patch) | |
| tree | be5dffb5f735ab497c9affe155fd82267ed4b412 /extra | |
| parent | c6b33542398c933a6272586e6280f7026b63a124 (diff) | |
| download | fatcat-c15cbf3568f7d91774e1cb82a39474c0ff874616.tar.gz fatcat-c15cbf3568f7d91774e1cb82a39474c0ff874616.zip | |
sitemap fixes from testing
Diffstat (limited to 'extra')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | extra/sitemap/README.md | 11 |
| -rwxr-xr-x | extra/sitemap/generate_sitemap_indices.py | 2 |
| -rwxr-xr-x | extra/sitemap/release_url_lists.sh | 6 |
3 files changed, 15 insertions, 4 deletions
```diff
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 735ac925..f72893cd 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -1,4 +1,15 @@
 
+## HOWTO: Update
+
+After a container dump, as `fatcat` user on prod server:
+
+    cd /srv/fatcat/sitemap
+    export DATE=`date --iso-8601` # or whatever
+    /srv/fatcat/src/extra/sitemap/container_url_lists.sh $DATE /srv/fatcat/snapshots/container_export.json.gz
+    /srv/fatcat/src/extra/sitemap/release_url_lists.sh $DATE /srv/fatcat/snapshots/release_export_expanded.json.gz
+    # delete old sitemap url lists
+    /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py
+
 ## Background
 
 Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
index 9766ac1f..0a5624a1 100755
--- a/extra/sitemap/generate_sitemap_indices.py
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -6,7 +6,7 @@ import datetime
 
 
 def index_entity(entity_type, output):
-    now = datetime.datetime.now().isoformat()
+    now = datetime.date.today().isoformat()
     print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
     print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
 
diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
index 4190011f..d5c8d4ef 100755
--- a/extra/sitemap/release_url_lists.sh
+++ b/extra/sitemap/release_url_lists.sh
@@ -6,14 +6,14 @@ set -o pipefail     # fail if part of a '|' command fails
 
 : ${1?' You you did not supply a date argument'}
 : ${2?' You you did not supply an input file (JSON gzip)'}
-if [ -f $2 ] ; then
+if [ ! -f $2 ] ; then
   echo "Input file not found: $2" && exit 1;
 fi
 
 # eg, 2020-08-19
-DATE = "$1"
+DATE="$1"
 # eg, release_export_expanded.json.gz
-EXPORT_FILE_GZ = "$2"
+EXPORT_FILE_GZ="$2"
 
 # filter to fulltext releases only, then filter to only one hit per work
 zcat $EXPORT_FILE_GZ \
```
