diff options
Diffstat (limited to 'extra/sitemap/work_urls_query.sh')
-rwxr-xr-x | extra/sitemap/work_urls_query.sh | 21 |
1 files changed, 21 insertions, 0 deletions
#!/usr/bin/env bash
#
# Build gzipped sitemap URL lists for scholar.archive.org work pages.
#
# Usage: work_urls_query.sh DATE
#   DATE  label embedded in the output file names, eg 2020-08-19
#
# Output (written to the current directory):
#   sitemap-works-DATE-NNNNN.txt.gz, at most 20000 URLs per file
#
# Requires: fatcat-cli, pv, jq, tr, awk, GNU split, gzip

set -e          # fail on error
set -u          # fail if variable not set in substitution
set -o pipefail # fail if part of a '|' command fails

# ':?' rejects an empty argument as well as a missing one; an empty date
# would otherwise produce files named sitemap-works--NNNNN.txt
: "${1:?You did not supply a date argument}"

# eg, 2020-08-19
DATE="$1"

# query for specific works
#
# Pipeline stages:
#   fatcat-cli  search hits as JSON, one per line
#               (--limit 0 presumably means "no limit" -- confirm against
#               the fatcat-cli documentation)
#   pv -l       progress meter counting lines
#   jq          extract the raw "key" field of each hit
#   tr          turn every '_' in the key into '/'
#               (presumably keys look like work_<ident>, giving the URL
#               path work/<ident> -- verify against the index schema)
#   awk         prefix the site base URL
#   split       20000 lines per file, 5-digit numeric suffix, .txt extension
fatcat-cli search scholar \
  "doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)" \
  --index-json --limit 0 \
  | pv -l \
  | jq .key -r \
  | tr '_' '/' \
  | awk '{print "https://scholar.archive.org/" $1}' \
  | split --lines 20000 - "sitemap-works-${DATE}-" -d -a 5 --additional-suffix .txt

# Compress only the files produced by this run; a bare sitemap-works-*.txt
# glob would also pick up leftovers from other dates in the same directory.
gzip -- "sitemap-works-${DATE}-"*.txt