From 6b1f87c12f7d40a3016910b214579a368c747df4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Thu, 29 Apr 2021 10:03:47 -0700
Subject: sitemap generation

---
 extra/sitemap/.gitignore                  |  3 +++
 extra/sitemap/README.md                   | 21 +++++++++++++++++++++
 extra/sitemap/generate_sitemap_indices.py | 26 ++++++++++++++++++++++++++
 extra/sitemap/work_urls_query.sh          | 21 +++++++++++++++++++++
 4 files changed, 71 insertions(+)
 create mode 100644 extra/sitemap/.gitignore
 create mode 100644 extra/sitemap/README.md
 create mode 100755 extra/sitemap/generate_sitemap_indices.py
 create mode 100755 extra/sitemap/work_urls_query.sh

(limited to 'extra')
diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore
new file mode 100644
index 0000000..5dd7dad
--- /dev/null
+++ b/extra/sitemap/.gitignore
@@ -0,0 +1,3 @@
+*.txt.gz
+*.xml
+*.json.gz
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
new file mode 100644
index 0000000..242378a
--- /dev/null
+++ b/extra/sitemap/README.md
@@ -0,0 +1,21 @@
+
+## HOWTO: Update
+
+Requires [fatcat-cli](https://gitlab.com/bnewbold/fatcat-cli), `jq`, and `pv`
+installed. Run these commands on a production machine.
+
+    cd /srv/fatcat_scholar/sitemap
+    export DATE=`date --iso-8601`
+    rm *.txt.gz
+    /srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE
+    /srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py
+
+## Background
+
+Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50k
+lines / 50 MByte for XML sitemap files. Google Scholar has indicated a smaller
+20k URL / 5 MB limit.
+
+## Resources
+
+Google sitemap verifier: https://support.google.com/webmasters/answer/7451001
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
new file mode 100755
index 0000000..5b5cad2
--- /dev/null
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+import sys
+import glob
+import datetime
+
+def index_entity(entity_type, output):
+    """Write a sitemap index XML document listing `sitemap-{entity_type}-*.txt.gz` files in the CWD to `output`."""
+    now = datetime.date.today().isoformat()
+    print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
+    print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
+    # sort: glob returns matches in arbitrary order; sorting keeps the index stable between runs
+    for filename in sorted(glob.glob(f"sitemap-{entity_type}-*.txt.gz")):
+        print("  <sitemap>", file=output)
+        print(f"    <loc>https://scholar.archive.org/{filename}</loc>", file=output)
+        print(f"    <lastmod>{now}</lastmod>", file=output)
+        print("  </sitemap>", file=output)
+    # the root element is closed unconditionally, so an empty match still yields a complete document
+    print("</sitemapindex>", file=output)
+
+def main():  # entry point: writes sitemap-index-works.xml into the current working directory
+    with open('sitemap-index-works.xml', 'w', encoding='utf-8') as output:  # match the UTF-8 XML declaration
+        index_entity("works", output)
+
+if __name__ == "__main__":
+    main()
diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh
new file mode 100755
index 0000000..c02eb74
--- /dev/null
+++ b/extra/sitemap/work_urls_query.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Usage: work_urls_query.sh DATE -- writes gzipped sitemap-works-DATE-NNNNN.txt URL lists to the current directory
+set -e              # fail on error
+set -u              # fail if variable not set in substitution
+set -o pipefail     # fail if part of a '|' command fails
+
+: ${1?' You did not supply a date argument'}
+
+# eg, 2020-08-19
+DATE="$1"
+
+# query for specific works
+# pipeline: search hits (JSON) -> document key -> '_' to '/' -> full URL -> files of at most 20000 lines each
+fatcat-cli search scholar "doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)" --index-json --limit 0 \
+    | pv -l \
+    | jq .key -r \
+    | tr '_' '/' \
+    | awk '{print "https://scholar.archive.org/" $1}' \
+    | split --lines 20000 - "sitemap-works-$DATE-" -d -a 5 --additional-suffix .txt
+
+gzip sitemap-works-*.txt
-- 
cgit v1.2.3