about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- extra/sitemap/README.md 2
-rwxr-xr-x extra/sitemap/generate_sitemap_indices.py 2
-rwxr-xr-x extra/sitemap/work_urls_query.sh 2
-rw-r--r-- fatcat_scholar/static/robots.allow.txt 4
-rw-r--r-- fatcat_scholar/templates/work.html 7
-rw-r--r-- proposals/2021-04-28_indexability.md 6
6 files changed, 11 insertions, 12 deletions
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md
index 1e1938a..6c03095 100644
--- a/extra/sitemap/README.md
+++ b/extra/sitemap/README.md
@@ -7,8 +7,6 @@ installed. Run these commands on a production machine.
cd /srv/fatcat_scholar/sitemap
export DATE=`date --iso-8601`
/srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE
- rm *.txt.gz
- gzip sitemap-*.txt
/srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py
## Background
diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py
index 5b5cad2..f1ec494 100755
--- a/extra/sitemap/generate_sitemap_indices.py
+++ b/extra/sitemap/generate_sitemap_indices.py
@@ -10,7 +10,7 @@ def index_entity(entity_type, output):
print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output)
print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output)
- for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"):
+ for filename in glob.glob(f"sitemap-{entity_type}-*.txt"):
print(" <sitemap>", file=output)
print(f" <loc>https://scholar.archive.org/{filename}</loc>", file=output)
print(f" <lastmod>{now}</lastmod>", file=output)
diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh
index 2971f47..98475ff 100755
--- a/extra/sitemap/work_urls_query.sh
+++ b/extra/sitemap/work_urls_query.sh
@@ -16,5 +16,3 @@ fatcat-cli search scholar 'doc_type:work (fulltext.access_type:ia_file OR fullte
| tr '_' '/' \
| awk '{print "https://scholar.archive.org/" $1}' \
| split --lines 20000 - sitemap-works-$DATE- -d -a 5 --additional-suffix .txt
-
-gzip sitemap-works-*.txt
diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt
index 35f13a3..fb0c251 100644
--- a/fatcat_scholar/static/robots.allow.txt
+++ b/fatcat_scholar/static/robots.allow.txt
@@ -13,5 +13,5 @@ User-agent: *
Allow: /search
Crawl-delay: 5
-Sitemap: /sitemap.xml
-Sitemap: /sitemap-index-works.xml
+Sitemap: https://scholar.archive.org/sitemap.xml
+Sitemap: https://scholar.archive.org/sitemap-index-works.xml
diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html
index 067d23c..8829ec2 100644
--- a/fatcat_scholar/templates/work.html
+++ b/fatcat_scholar/templates/work.html
@@ -18,7 +18,6 @@
{% if work.biblio.container_name %}
<meta name="citation_journal_title" content="{{ work.biblio.container_name }}">
{% endif %}
-
{% if work.biblio.volume %}
<meta name="citation_volume" content="{{ work.biblio.volume }}">
{% endif %}
@@ -32,9 +31,9 @@
<meta name="citation_doi" content="{{ work.biblio.doi }}">
{% endif %}
{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype == "application/pdf" and work.fulltext.file_sha1 %}
-<!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
-<meta name="citation_pdf_url" content="/access-redirect/{{ work.fulltext.file_sha1 }}.pdf">
-<!-- <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> -->
+ <!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
+ <meta name="citation_pdf_url" content="https://scholar.archive.org/access-redirect/{{ work.fulltext.file_sha1 }}.pdf">
+ <!-- Multiple URLs allowed? <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> -->
{% endif %}
{% endblock %}
diff --git a/proposals/2021-04-28_indexability.md b/proposals/2021-04-28_indexability.md
index cfa928f..a58d23d 100644
--- a/proposals/2021-04-28_indexability.md
+++ b/proposals/2021-04-28_indexability.md
@@ -46,6 +46,8 @@ release will be linked, not earlier pre-print or accepted manuscript versions.
This behavior may change at some point to include "green" access links from the
"work" landing page.
+The `citation_pdf_url` tag should contain an absolute URL, not a relative URL.
+
Alternatively, we could have landing pages only for "releases" (versions), like
already exist on fatcat.wiki. This would make the decision about which files to
link to simpler.
@@ -101,6 +103,8 @@ will include:
/robots.txt - updated to include sitemap references
/sitemap.xml - basic generic list of pages (homepage, about, userguide)
/sitemap-index-works.xml - XML file pointing to many sub-sitemap files; includes lastmod metadata
- /sitemap-works-YYYY-MM-DD-NNNNN.txt.gz - series of timestamped "simple" sitemaps (URL list files)
+ /sitemap-works-YYYY-MM-DD-NNNNN.txt - series of timestamped "simple" sitemaps (URL list files)
Only works for which there is an appropriate fulltext access URL
+
+The sitemap links from robots.txt should be absolute URLs, not relative URLs.