From c367d54fe47cf71ada73fa9ad16495824e07abfc Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 30 Apr 2021 14:17:12 -0700
Subject: sitemaps: not gzip compressed

---
 extra/sitemap/README.md                   | 2 --
 extra/sitemap/generate_sitemap_indices.py | 2 +-
 extra/sitemap/work_urls_query.sh          | 2 --
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'extra')

diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 1e1938a..6c03095 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -7,8 +7,6 @@ installed. Run these commands on a production machine.
     cd /srv/fatcat_scholar/sitemap
     export DATE=`date --iso-8601`
     /srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE
-    rm *.txt.gz
-    gzip sitemap-*.txt
     /srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py
 
 ## Background
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
index 5b5cad2..f1ec494 100755
--- a/extra/sitemap/generate_sitemap_indices.py
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -10,7 +10,7 @@ def index_entity(entity_type, output):
     print("""""", file=output)
     print("""""", file=output)
 
-    for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"):
+    for filename in glob.glob(f"sitemap-{entity_type}-*.txt"):
         print(" ", file=output)
         print(f" https://scholar.archive.org/{filename}", file=output)
         print(f" {now}", file=output)
diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh
index 2971f47..98475ff 100755
--- a/extra/sitemap/work_urls_query.sh
+++ b/extra/sitemap/work_urls_query.sh
@@ -16,5 +16,3 @@ fatcat-cli search scholar 'doc_type:work (fulltext.access_type:ia_file OR fullte
     | tr '_' '/' \
     | awk '{print "https://scholar.archive.org/" $1}' \
     | split --lines 20000 - sitemap-works-$DATE- -d -a 5 --additional-suffix .txt
-
-gzip sitemap-works-*.txt
-- 
cgit v1.2.3