aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-05-18 11:14:54 -0700
committerBryan Newbold <bnewbold@archive.org>2021-05-18 11:14:54 -0700
commite72408b48c1f32c4b5b34bd3861584d98b362255 (patch)
treeeb08e621aa1e2b9ae8764493cf9e5235dfa2c0bd
parent9c51894168032a40f2b595208db0ded0baee24bb (diff)
downloadfatcat-scholar-e72408b48c1f32c4b5b34bd3861584d98b362255.tar.gz
fatcat-scholar-e72408b48c1f32c4b5b34bd3861584d98b362255.zip
sitemaps: PDF sitemaps
-rw-r--r--extra/sitemap/.gitignore1
-rw-r--r--extra/sitemap/README.md1
-rwxr-xr-xextra/sitemap/generate_sitemap_indices.py2
-rwxr-xr-xextra/sitemap/pdf_urls_query.sh14
-rwxr-xr-xextra/sitemap/transform_access_url.py24
5 files changed, 42 insertions, 0 deletions
diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore
index 5dd7dad..2c2b788 100644
--- a/extra/sitemap/.gitignore
+++ b/extra/sitemap/.gitignore
@@ -1,3 +1,4 @@
*.txt.gz
+*.txt
*.xml
*.json.gz
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 055575f..1a24620 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -6,6 +6,7 @@ installed. Run these commands on a production machine.
cd /srv/fatcat_scholar/sitemap
/srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh
+ /srv/fatcat_scholar/src/extra/sitemap/pdf_urls_query.sh
/srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py
## Background
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
index f1ec494..ed4b38b 100755
--- a/extra/sitemap/generate_sitemap_indices.py
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -21,6 +21,8 @@ def index_entity(entity_type, output):
def main():
with open('sitemap-index-works.xml', 'w') as output:
index_entity("works", output)
+ with open('sitemap-index-pdfs.xml', 'w') as output:
+ index_entity("pdfs", output)
if __name__=="__main__":
main()
diff --git a/extra/sitemap/pdf_urls_query.sh b/extra/sitemap/pdf_urls_query.sh
new file mode 100755
index 0000000..fb1a4b8
--- /dev/null
+++ b/extra/sitemap/pdf_urls_query.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+#
+# Dump scholar.archive.org access URLs for works with archived fulltext,
+# split into numbered text files (sitemap-pdfs-NNNNN.txt) in the current
+# working directory.
+
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+# The URL rewriting helper lives next to this script. Resolve it from the
+# script's own location instead of "./", so the pipeline also works when the
+# script is invoked by absolute path from a different working directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# query for specific works; about 8.6 million circa 2021-04-29
+#
+# Pipeline stages:
+#   pv -l     show progress as a line count
+#   jq        emit "<access_type>\t<access_url>" for each search hit
+#   rg -v     drop rows that start with "null"
+#   helper    rewrite each row into a single access redirect URL
+#   split     write 20000 URLs per file, 5-digit numeric suffix, ".txt"
+fatcat-cli search scholar 'doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (NOT biblio.arxiv_id:*) (NOT biblio.pmcid:*) (NOT biblio.publisher_type:big5) (year:<1926 OR tags:*)' --index-json --limit 0 \
+    | pv -l \
+    | jq '[.fulltext.access_type, .fulltext.access_url] | @tsv' -r \
+    | rg -v '^null' \
+    | "$SCRIPT_DIR/transform_access_url.py" \
+    | split --lines 20000 - sitemap-pdfs- -d -a 5 --additional-suffix .txt
diff --git a/extra/sitemap/transform_access_url.py b/extra/sitemap/transform_access_url.py
new file mode 100755
index 0000000..b00bd82
--- /dev/null
+++ b/extra/sitemap/transform_access_url.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+import sys
+
+# NOTE: copied from fatcat_scholar/hacks.py
+def make_access_redirect_url(access_type: str, access_url: str) -> str:
+    """
+    Rewrite an archive.org access URL into its scholar.archive.org redirect.
+
+    - access_type "wayback" with "://web.archive.org/" in the URL becomes
+      https://scholar.archive.org/access/wayback/{dt}/{original_url}
+    - access_type "ia_file" with "://archive.org/download/" in the URL becomes
+      https://scholar.archive.org/access/ia_file/{rest of the path}
+
+    Anything else is returned unchanged. That includes a wayback URL which is
+    too short to carry a timestamp segment, so malformed input passes through
+    instead of raising IndexError.
+    """
+    if access_type == "wayback" and "://web.archive.org/" in access_url:
+        # Splitting on "/" yields: scheme, "", host, first path segment, then
+        # the timestamp at index 4 and the original URL in everything after.
+        # (Assumes the first path segment is "web"; it is not checked.)
+        segments = access_url.split("/")
+        if len(segments) >= 5:
+            dt = segments[4]
+            original_url = "/".join(segments[5:])
+            return f"https://scholar.archive.org/access/wayback/{dt}/{original_url}"
+    elif access_type == "ia_file" and "://archive.org/download/" in access_url:
+        # Keep everything after "scheme://archive.org/download/".
+        suffix = "/".join(access_url.split("/")[4:])
+        return f"https://scholar.archive.org/access/ia_file/{suffix}"
+    return access_url
+
+def run() -> None:
+    """
+    Read "<access_type>\\t<access_url>" rows from stdin and print one
+    redirect URL per row to stdout.
+
+    Blank lines are skipped. A non-blank row that does not split into exactly
+    two tab-separated fields raises ValueError.
+    """
+    for line in sys.stdin:
+        row = line.strip()
+        if not row:
+            # nothing to transform; avoid an opaque unpacking error
+            continue
+        (access_type, access_url) = row.split('\t')
+        print(make_access_redirect_url(access_type, access_url))
+
+if __name__ == "__main__":
+ run()