summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- extra/sitemap/.gitignore 3
-rw-r--r-- extra/sitemap/README.md 21
-rwxr-xr-x extra/sitemap/generate_sitemap_indices.py 26
-rwxr-xr-x extra/sitemap/work_urls_query.sh 21
4 files changed, 71 insertions, 0 deletions
diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore
new file mode 100644
index 0000000..5dd7dad
--- /dev/null
+++ b/extra/sitemap/.gitignore
@@ -0,0 +1,3 @@
+*.txt.gz
+*.xml
+*.json.gz
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
new file mode 100644
index 0000000..242378a
--- /dev/null
+++ b/extra/sitemap/README.md
@@ -0,0 +1,21 @@
+
+## HOWTO: Update
+
+Requires [fatcat-cli](https://gitlab.com/bnewbold/fatcat-cli), `jq`, and `pv`
+installed. Run these commands on a production machine.
+
+    cd /srv/fatcat_scholar/sitemap
+    export DATE=`date --iso-8601`
+    rm -f *.txt.gz
+    /srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE
+    /srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py
+
+## Background
+
+Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50k
+lines / 50 MByte for XML sitemap files. Google Scholar has indicated a smaller
+20k URL / 5 MByte limit.
+
+## Resources
+
+Google sitemap verifier: https://support.google.com/webmasters/answer/7451001
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
new file mode 100755
index 0000000..5b5cad2
--- /dev/null
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+import sys
+import glob
+import datetime
+
+def index_entity(entity_type, output):
+
+ now = datetime.date.today().isoformat()
+ print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
+ print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
+
+ for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"):
+ print(" <sitemap>", file=output)
+ print(f" <loc>https://scholar.archive.org/{filename}</loc>", file=output)
+ print(f" <lastmod>{now}</lastmod>", file=output)
+ print(" </sitemap>", file=output)
+
+ print("</sitemapindex>", file=output)
+
+def main():
+ with open('sitemap-index-works.xml', 'w') as output:
+ index_entity("works", output)
+
+if __name__=="__main__":
+ main()
diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh
new file mode 100755
index 0000000..c02eb74
--- /dev/null
+++ b/extra/sitemap/work_urls_query.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Usage: work_urls_query.sh DATE -- writes sitemap-works-DATE-NNNNN.txt.gz into the CWD
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+: ${1?' You did not supply a date argument'}
+
+# eg, 2020-08-19
+DATE="$1"
+
+# query for specific works: ia_file/wayback fulltext that is pre-1925, longtail, or oa
+# pipeline: pv shows line progress; jq extracts .key; '_' becomes '/'; 20000 URLs per file
+fatcat-cli search scholar "doc_type:work (fulltext.access_type:ia_file OR fulltext.access_type:wayback) (year:<1925 OR publisher_type:longtail OR publisher_type:oa)" --index-json --limit 0 \
+    | pv -l \
+    | jq .key -r \
+    | tr '_' '/' \
+    | awk '{print "https://scholar.archive.org/" $1}' \
+    | split --lines 20000 - "sitemap-works-$DATE-" -d -a 5 --additional-suffix .txt
+# compress the URL lists; generate_sitemap_indices.py globs for sitemap-works-*.txt.gz
+gzip sitemap-works-*.txt