diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-04-29 10:03:47 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-04-29 10:03:47 -0700 |
commit | 6b1f87c12f7d40a3016910b214579a368c747df4 (patch) | |
tree | d52e555663a3fe395fe0024098735adaf8e10494 /extra/sitemap/work_urls_query.sh | |
parent | 4b152e02d1a0d0d7a9a391ed211ecd6f304d6962 (diff) | |
download | fatcat-scholar-6b1f87c12f7d40a3016910b214579a368c747df4.tar.gz fatcat-scholar-6b1f87c12f7d40a3016910b214579a368c747df4.zip |
sitemap generation
Diffstat (limited to 'extra/sitemap/work_urls_query.sh')
-rwxr-xr-x | extra/sitemap/work_urls_query.sh | 21 |
1 files changed, 21 insertions, 0 deletions
#!/usr/bin/env bash
#
# extra/sitemap/work_urls_query.sh
#
# Generates sitemap URL lists for scholar.archive.org work pages.
#
# Usage: work_urls_query.sh DATE
#   DATE  stamp embedded in the output file names, eg 2020-08-19
#
# Runs a fatcat-cli search against the scholar index, turns the "key" of each
# returned document into a https://scholar.archive.org/ URL, and writes the
# URLs to the current directory in chunks of 20000 lines named
# sitemap-works-DATE-NNNNN.txt. The chunks are then gzip-compressed in place.
#
# Requires: fatcat-cli, pv, jq, awk, gzip, and GNU split (the -d and
# --additional-suffix options are GNU extensions).

set -e          # fail on error
set -u          # fail if variable not set in substitution
set -o pipefail # fail if part of a '|' command fails

# ':?' (not just '?') also rejects an empty argument, which would otherwise
# produce output files named "sitemap-works--NNNNN.txt".
: "${1:?You did not supply a date argument}"

# eg, 2020-08-19
DATE="$1"

# query for specific works
#
# Pipeline stages:
#   fatcat-cli  one JSON document per line (--index-json); "--limit 0" is
#               presumably "no limit" -- TODO confirm against fatcat-cli docs
#   pv -l       progress meter counting lines, passes data through unchanged
#   jq          extracts the raw (unquoted) "key" field of each document
#   tr          rewrites '_' to '/'; presumably keys look like "work_<ident>"
#               so this yields the "work/<ident>" URL path -- verify
#   awk         prefixes the site base URL
#   split       20000 URLs per file, 5-digit numeric suffix, ".txt" extension
fatcat-cli search scholar \
  "doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)" \
  --index-json --limit 0 \
  | pv -l \
  | jq .key -r \
  | tr '_' '/' \
  | awk '{print "https://scholar.archive.org/" $1}' \
  | split --lines 20000 - "sitemap-works-${DATE}-" -d -a 5 --additional-suffix .txt

# Compress only the chunks written by this run; a bare "sitemap-works-*.txt"
# glob would also pick up leftover files from runs with other dates.
gzip -- "sitemap-works-${DATE}-"*.txt