From 26860b70a58c5e413f4b607ad304b17ebe9aced8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 7 Apr 2021 18:54:25 -0700
Subject: sitemaps: filter to releases with PDF fulltext (for now)

---
 extra/sitemap/release_url_lists.sh | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'extra/sitemap')

diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
index d5c8d4ef..280ecab1 100755
--- a/extra/sitemap/release_url_lists.sh
+++ b/extra/sitemap/release_url_lists.sh
@@ -19,6 +19,8 @@ EXPORT_FILE_GZ="$2"
 zcat $EXPORT_FILE_GZ \
     | rg '"release_ids"' \
     | rg 'archive.org/' \
+    | rg 'application/pdf' \
+    | rg '"url":' \
     | rg -v '"stub"' \
     | jq -r '[.work_id, .ident] | @tsv' \
     | uniq -w 26 \
-- 
cgit v1.2.3