From 6b1f87c12f7d40a3016910b214579a368c747df4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 29 Apr 2021 10:03:47 -0700
Subject: sitemap generation

---
 extra/sitemap/.gitignore | 3 +++
 extra/sitemap/README.md | 21 +++++++++++++++++++++
 extra/sitemap/generate_sitemap_indices.py | 26 ++++++++++++++++++++++++++
 extra/sitemap/work_urls_query.sh | 21 +++++++++++++++++++++
 4 files changed, 71 insertions(+)
 create mode 100644 extra/sitemap/.gitignore
 create mode 100644 extra/sitemap/README.md
 create mode 100755 extra/sitemap/generate_sitemap_indices.py
 create mode 100755 extra/sitemap/work_urls_query.sh

(limited to 'extra')

diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore
new file mode 100644
index 0000000..5dd7dad
--- /dev/null
+++ b/extra/sitemap/.gitignore
@@ -0,0 +1,3 @@
+*.txt.gz
+*.xml
+*.json.gz
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
new file mode 100644
index 0000000..242378a
--- /dev/null
+++ b/extra/sitemap/README.md
@@ -0,0 +1,21 @@
+
+## HOWTO: Update
+
+Requires [fatcat-cli](https://gitlab.com/bnewbold/fatcat-cli), `jq`, and `pv`
+installed. Run these commands on a production machine.
+
+    cd /srv/fatcat_scholar/sitemap
+    export DATE=`date --iso-8601`
+    /srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE
+    rm *.txt.gz
+    /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py
+
+## Background
+
+Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50k
+lines / 50 MByte for XML sitemap files. Google Scholar has indicated a smaller
+20k URL / 5 MB limit.
+
+## Resources
+
+Google sitemap verifier: https://support.google.com/webmasters/answer/7451001
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
new file mode 100755
index 0000000..5b5cad2
--- /dev/null
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+import sys
+import glob
+import datetime
+
+def index_entity(entity_type, output):
+    """Print an XML sitemap index to `output`, one entry per sitemap-{entity_type}-*.txt.gz file in the current directory, dated today."""
+    now = datetime.date.today().isoformat()
+    print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
+    print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
+    # glob order is arbitrary; sort so the generated index is stable between runs
+    for filename in sorted(glob.glob(f"sitemap-{entity_type}-*.txt.gz")):
+        print("  <sitemap>", file=output)
+        print(f"    <loc>https://scholar.archive.org/{filename}</loc>", file=output)
+        print(f"    <lastmod>{now}</lastmod>", file=output)
+        print("  </sitemap>", file=output)
+
+    print("</sitemapindex>", file=output)
+
+def main():  # writes sitemap-index-works.xml into the current working directory
+    with open('sitemap-index-works.xml', 'w') as output:
+        index_entity("works", output)
+
+if __name__=="__main__":
+    main()
diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh
new file mode 100755
index 0000000..c02eb74
--- /dev/null
+++ b/extra/sitemap/work_urls_query.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+: ${1?' You did not supply a date argument'}
+
+# eg, 2020-08-19
+DATE="$1"
+
+# query for specific works
+
+fatcat-cli search scholar "doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)" --index-json --limit 0 \
+    | pv -l \
+    | jq .key -r \
+    | tr '_' '/' \
+    | awk '{print "https://scholar.archive.org/" $1}' \
+    | split --lines 20000 - "sitemap-works-$DATE-" -d -a 5 --additional-suffix .txt
+
+gzip sitemap-works-*.txt
-- 
cgit v1.2.3