summaryrefslogtreecommitdiffstats
path: root/extra/sitemap/release_url_lists.sh
blob: 280ecab17aa0fa9b5dca3619e084758820cbd4e7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/env bash
#
# Build gzip-compressed sitemap URL lists of fulltext releases from a release
# export dump.
#
# Usage: release_url_lists.sh DATE EXPORT_FILE_GZ
#   DATE            label embedded in the output file names (eg, 2020-08-19)
#   EXPORT_FILE_GZ  gzip-compressed export with one JSON document per line
#                   (eg, release_export_expanded.json.gz)
#
# Output, written to the current directory:
#   sitemap-releases-DATE-NNNNN.txt.gz, each holding at most 20000 URLs of the
#   form https://fatcat.wiki/release/<ident>
#
# Requires: zcat, rg (ripgrep), jq, gzip, and GNU uniq/split (for `-w` and
# `--additional-suffix`).

set -e              # fail on error
set -u              # fail if variable not set in substitution
set -o pipefail     # fail if part of a '|' command fails

# ':?' (rather than '?') rejects empty arguments as well as missing ones; an
# empty file argument would otherwise leave zcat reading from stdin.
: "${1:?You did not supply a date argument}"
: "${2:?You did not supply an input file (JSON gzip)}"

if [[ ! -f "$2" ]]; then
  echo "Input file not found: $2" >&2
  exit 1
fi

# eg, 2020-08-19
DATE="$1"
# eg, release_export_expanded.json.gz
EXPORT_FILE_GZ="$2"

# filter to fulltext releases only, then filter to only one hit per work
#
# The rg stages are plain line filters applied before JSON parsing, so jq only
# has to parse the lines that survive them.
#
# `uniq -w 26` compares only the first 26 characters of each "work_id<TAB>ident"
# line and only collapses *adjacent* duplicates.
# NOTE(review): this assumes work_id is exactly 26 characters long and that the
# export is already grouped by work_id -- confirm against the export job.
zcat -- "$EXPORT_FILE_GZ" \
    | rg '"release_ids"' \
    | rg 'archive.org/' \
    | rg 'application/pdf' \
    | rg '"url":' \
    | rg -v '"stub"' \
    | jq -r '[.work_id, .ident] | @tsv' \
    | uniq -w 26 \
    | cut -f 2 \
    | awk '{print "https://fatcat.wiki/release/" $1 }' \
    | split --lines 20000 -d -a 5 --additional-suffix .txt - "sitemap-releases-${DATE}-"

# Compresses every matching .txt file in the current directory, which includes
# leftovers from earlier runs with a different DATE. If the pipeline produced
# no output, the glob matches nothing and gzip fails, aborting via `set -e`.
gzip -- sitemap-releases-*.txt