#!/usr/bin/env bash
#
# Generate gzipped sitemap URL lists for "work" pages on
# scholar.archive.org.
#
# Usage: work_urls_query.sh DATE
#   DATE  tag embedded in the output file names, eg 2020-08-19
#
# Output (written to the current directory):
#   sitemap-works-DATE-NNNNN.txt.gz, at most 20000 URLs per file
#
# Requires: fatcat-cli, pv, jq, and GNU split (for --additional-suffix).
#
# Provenance: extra/sitemap/work_urls_query.sh as added by commit
# 6b1f87c12f7d40a3016910b214579a368c747df4 ("sitemap generation",
# Bryan Newbold, 2021-04-29).

set -e          # fail on error
set -u          # fail if variable not set in substitution
set -o pipefail # fail if part of a '|' command fails

# Prefix prepended to every key to form the public URL.
readonly BASE_URL="https://scholar.archive.org/"

# Lines per output file; well under the sitemap protocol's cap of
# 50,000 URLs per sitemap file.
readonly URLS_PER_FILE=20000

# Query for specific works: those with fulltext access via ia_file or wayback
# that are also pre-1925, or from a longtail or OA publisher.
readonly QUERY="doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)"

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Run the search and write the compressed sitemap chunks.
# Globals:   BASE_URL, URLS_PER_FILE, QUERY (read)
# Arguments: $1 - date tag used in the output file names (non-empty)
# Outputs:   sitemap-works-<date>-NNNNN.txt.gz in the current directory;
#            pv progress on stderr
# Returns:   exits non-zero if the argument is missing, a tool is missing,
#            any pipeline stage fails, or no output files were produced
#######################################
main() {
  # ':?' (rather than '?') also rejects an empty string, which would
  # otherwise produce files named "sitemap-works--NNNNN.txt".
  : "${1:?You did not supply a date argument}"

  local date_tag="$1"
  local prefix="sitemap-works-${date_tag}-"

  # Fail fast, before any partial output is written.
  local tool
  for tool in fatcat-cli pv jq; do
    command -v "$tool" >/dev/null || die "required tool not found: ${tool}"
  done

  # NOTE(review): '--limit 0' presumably means "no limit" -- confirm against
  # the fatcat-cli documentation.
  #
  # Stages: search results as JSON lines -> line-count progress meter ->
  # extract the raw .key field -> turn '_' into '/' -> prepend the base URL
  # -> split into numbered (-d), 5-digit (-a 5) .txt chunks.
  fatcat-cli search scholar "$QUERY" --index-json --limit 0 \
    | pv -l \
    | jq -r '.key' \
    | tr '_' '/' \
    | awk -v base="$BASE_URL" '{print base $1}' \
    | split --lines "$URLS_PER_FILE" -d -a 5 --additional-suffix .txt - "$prefix"

  # Compress only the chunks belonging to this date tag, so stale files from
  # other runs in the same directory are left alone. nullglob makes an empty
  # match yield an empty array instead of the literal pattern.
  shopt -s nullglob
  local -a chunks
  chunks=("$prefix"*.txt)
  ((${#chunks[@]} > 0)) || die "no sitemap files were generated for ${date_tag}"

  gzip -- "${chunks[@]}"
}

main "$@"