diff options
Diffstat (limited to 'extra/sitemap')
-rw-r--r-- | extra/sitemap/.gitignore | 3 | ||||
-rw-r--r-- | extra/sitemap/README.md | 21 | ||||
-rwxr-xr-x | extra/sitemap/generate_sitemap_indices.py | 26 | ||||
-rwxr-xr-x | extra/sitemap/work_urls_query.sh | 21 |
4 files changed, 71 insertions, 0 deletions
diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore new file mode 100644 index 0000000..5dd7dad --- /dev/null +++ b/extra/sitemap/.gitignore @@ -0,0 +1,3 @@ +*.txt.gz +*.xml +*.json.gz diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md new file mode 100644 index 0000000..242378a --- /dev/null +++ b/extra/sitemap/README.md @@ -0,0 +1,21 @@ + +## HOWTO: Update + +Requires [fatcat-cli](https://gitlab.com/bnewbold/fatcat-cli) and `jq` +installed. Run these commands on a production machine. + + cd /srv/fatcat_scholar/sitemap + export DATE=`date --iso-8601` + /srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE + rm *.txt.gz + /srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py + +## Background + +Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50k +lines / 50 MByte for XML sitemap files. Google Scholar has indicated a smaller +20k URL / 5 MB limit. + +## Resources + +Google sitemap verifier: https://support.google.com/webmasters/answer/7451001 diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py new file mode 100755 index 0000000..5b5cad2 --- /dev/null +++ b/extra/sitemap/generate_sitemap_indices.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +import sys +import glob +import datetime + +def index_entity(entity_type, output): + + now = datetime.date.today().isoformat() + print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output) + print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output) + + for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"): + print(" <sitemap>", file=output) + print(f" <loc>https://scholar.archive.org/{filename}</loc>", file=output) + print(f" <lastmod>{now}</lastmod>", file=output) + print(" </sitemap>", file=output) + + print("</sitemapindex>", file=output) + +def main(): + with open('sitemap-index-works.xml', 'w') as output: + index_entity("works", output) + +if __name__=="__main__": + 
main() diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh new file mode 100755 index 0000000..c02eb74 --- /dev/null +++ b/extra/sitemap/work_urls_query.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +: ${1?' You did not supply a date argument'} + +# eg, 2020-08-19 +DATE="$1" + +# query for specific works + +fatcat-cli search scholar "doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)" --index-json --limit 0 \ + | pv -l \ + | jq .key -r \ + | tr '_' '/' \ + | awk '{print "https://scholar.archive.org/" $1}' \ + | split --lines 20000 - sitemap-works-$DATE- -d -a 5 --additional-suffix .txt + +gzip sitemap-works-*.txt |