diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-04-07 18:54:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-04-07 18:54:25 -0700 |
commit | 26860b70a58c5e413f4b607ad304b17ebe9aced8 (patch) | |
tree | ea5084e3eeb0dfd4ca684c5895fee0108ae8ab5a /extra | |
parent | 97280d0a20baa00aa1f8dbd3bec62142ad2ce900 (diff) | |
download | fatcat-26860b70a58c5e413f4b607ad304b17ebe9aced8.tar.gz fatcat-26860b70a58c5e413f4b607ad304b17ebe9aced8.zip |
sitemaps: filter to releases with PDF fulltext (for now)
Diffstat (limited to 'extra')
-rwxr-xr-x | extra/sitemap/release_url_lists.sh | 2 |
1 file changed, 2 insertions, 0 deletions
```diff
diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh
index d5c8d4ef..280ecab1 100755
--- a/extra/sitemap/release_url_lists.sh
+++ b/extra/sitemap/release_url_lists.sh
@@ -19,6 +19,8 @@ EXPORT_FILE_GZ="$2"
 zcat $EXPORT_FILE_GZ \
     | rg '"release_ids"' \
     | rg 'archive.org/' \
+    | rg 'application/pdf' \
+    | rg '"url":' \
     | rg -v '"stub"' \
     | jq -r '[.work_id, .ident] | @tsv' \
     | uniq -w 26 \
```