diff options
-rw-r--r-- | extra/sitemap/README.md | 2 | ||||
-rwxr-xr-x | extra/sitemap/generate_sitemap_indices.py | 2 | ||||
-rwxr-xr-x | extra/sitemap/work_urls_query.sh | 2 | ||||
-rw-r--r-- | fatcat_scholar/static/robots.allow.txt | 4 | ||||
-rw-r--r-- | fatcat_scholar/templates/work.html | 7 | ||||
-rw-r--r-- | proposals/2021-04-28_indexability.md | 6 |
6 files changed, 11 insertions, 12 deletions
diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md index 1e1938a..6c03095 100644 --- a/extra/sitemap/README.md +++ b/extra/sitemap/README.md @@ -7,8 +7,6 @@ installed. Run these commands on a production machine. cd /srv/fatcat_scholar/sitemap export DATE=`date --iso-8601` /srv/fatcat_scholar/src/extra/sitemap/work_urls_query.sh $DATE - rm *.txt.gz - gzip sitemap-*.txt /srv/fatcat_scholar/src/extra/sitemap/generate_sitemap_indices.py ## Background diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py index 5b5cad2..f1ec494 100755 --- a/extra/sitemap/generate_sitemap_indices.py +++ b/extra/sitemap/generate_sitemap_indices.py @@ -10,7 +10,7 @@ def index_entity(entity_type, output): print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output) print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output) - for filename in glob.glob(f"sitemap-{entity_type}-*.txt.gz"): + for filename in glob.glob(f"sitemap-{entity_type}-*.txt"): print(" <sitemap>", file=output) print(f" <loc>https://scholar.archive.org/{filename}</loc>", file=output) print(f" <lastmod>{now}</lastmod>", file=output) diff --git a/extra/sitemap/work_urls_query.sh b/extra/sitemap/work_urls_query.sh index 2971f47..98475ff 100755 --- a/extra/sitemap/work_urls_query.sh +++ b/extra/sitemap/work_urls_query.sh @@ -16,5 +16,3 @@ fatcat-cli search scholar 'doc_type:work (fulltext.access_type:ia_file OR fullte | tr '_' '/' \ | awk '{print "https://scholar.archive.org/" $1}' \ | split --lines 20000 - sitemap-works-$DATE- -d -a 5 --additional-suffix .txt - -gzip sitemap-works-*.txt diff --git a/fatcat_scholar/static/robots.allow.txt b/fatcat_scholar/static/robots.allow.txt index 35f13a3..fb0c251 100644 --- a/fatcat_scholar/static/robots.allow.txt +++ b/fatcat_scholar/static/robots.allow.txt @@ -13,5 +13,5 @@ User-agent: * Allow: /search Crawl-delay: 5 -Sitemap: /sitemap.xml -Sitemap: /sitemap-index-works.xml +Sitemap: https://scholar.archive.org/sitemap.xml +Sitemap: https://scholar.archive.org/sitemap-index-works.xml diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html index 067d23c..8829ec2 100644 --- a/fatcat_scholar/templates/work.html +++ b/fatcat_scholar/templates/work.html @@ -18,7 +18,6 @@ {% if work.biblio.container_name %} <meta name="citation_journal_title" content="{{ work.biblio.container_name }}"> {% endif %} - {% if work.biblio.volume %} <meta name="citation_volume" content="{{ work.biblio.volume }}"> {% endif %} @@ -32,9 +31,9 @@ <meta name="citation_doi" content="{{ work.biblio.doi }}"> {% endif %} {% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype == "application/pdf" and work.fulltext.file_sha1 %} -<!-- PDF access redirect URL, as requested by, eg, scholar.google.com --> -<meta name="citation_pdf_url" content="/access-redirect/{{ work.fulltext.file_sha1 }}.pdf"> -<!-- <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> --> + <!-- PDF access redirect URL, as requested by, eg, scholar.google.com --> + <meta name="citation_pdf_url" content="https://scholar.archive.org/access-redirect/{{ work.fulltext.file_sha1 }}.pdf"> + <!-- Multiple URLs allowed? <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> --> {% endif %} {% endblock %} diff --git a/proposals/2021-04-28_indexability.md b/proposals/2021-04-28_indexability.md index cfa928f..a58d23d 100644 --- a/proposals/2021-04-28_indexability.md +++ b/proposals/2021-04-28_indexability.md @@ -46,6 +46,8 @@ release will be linked, not earlier pre-print or accepted manuscript versions. This behavior may change at some point to include "green" access links from the "work" landing page. +The `citation_pdf_url` tag should contain an absolute URL, not a relative URL. + Alternatively, we could have landing pages only for "releases" (versions), like already exist on fatcat.wiki. This would make the decision about which files to link to simpler. @@ -101,6 +103,8 @@ will include: /robots.txt - updated to include sitemap references /sitemap.xml - basic generic list of pages (homepage, about, userguide) /sitemap-index-works.xml - XML file pointing to many sub-sitemap files; includes lastmod metadata - /sitemap-works-YYYY-MM-DD-NNNNN.txt.gz - series of timestamped "simple" sitemaps (URL list files) + /sitemap-works-YYYY-MM-DD-NNNNN.txt - series of timestamped "simple" sitemaps (URL list files) Only works for which there is an appropriate fulltext access URL + +The sitemap links from robots.txt should be absolute URLs, not relative URLs. |