summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-04-07 18:54:25 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-04-07 18:54:25 -0700
commit 26860b70a58c5e413f4b607ad304b17ebe9aced8 (patch)
tree ea5084e3eeb0dfd4ca684c5895fee0108ae8ab5a
parent 97280d0a20baa00aa1f8dbd3bec62142ad2ce900 (diff)
download fatcat-26860b70a58c5e413f4b607ad304b17ebe9aced8.tar.gz
fatcat-26860b70a58c5e413f4b607ad304b17ebe9aced8.zip
sitemaps: filter to releases with PDF fulltext (for now)
-rwxr-xr-x extra/sitemap/release_url_lists.sh 2
1 files changed, 2 insertions, 0 deletions
diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
index d5c8d4ef..280ecab1 100755
--- a/extra/sitemap/release_url_lists.sh
+++ b/extra/sitemap/release_url_lists.sh
@@ -19,6 +19,8 @@ EXPORT_FILE_GZ="$2"
zcat $EXPORT_FILE_GZ \
| rg '"release_ids"' \
| rg 'archive.org/' \
+ | rg 'application/pdf' \
+ | rg '"url":' \
| rg -v '"stub"' \
| jq -r '[.work_id, .ident] | @tsv' \
| uniq -w 26 \