diff options
| -rw-r--r-- | extra/sitemap/.gitignore | 3 | ||||
| -rw-r--r-- | extra/sitemap/README.md | 69 | ||||
| -rwxr-xr-x | extra/sitemap/container_url_lists.sh | 23 | ||||
| -rwxr-xr-x | extra/sitemap/generate_sitemap_indices.py | 28 | ||||
| -rwxr-xr-x | extra/sitemap/release_url_lists.sh | 29 | ||||
| -rw-r--r-- | python/fatcat_web/routes.py | 15 | ||||
| -rw-r--r-- | python/fatcat_web/static/robots.deny_all.txt | 7 | ||||
| -rw-r--r-- | python/fatcat_web/static/robots.txt | 19 | ||||
| -rw-r--r-- | python/fatcat_web/static/sitemap.xml | 13 | ||||
| -rw-r--r-- | python/fatcat_web/templates/home.html | 7 | 
10 files changed, 206 insertions, 7 deletions
| diff --git a/extra/sitemap/.gitignore b/extra/sitemap/.gitignore new file mode 100644 index 00000000..5dd7dadc --- /dev/null +++ b/extra/sitemap/.gitignore @@ -0,0 +1,3 @@ +*.txt.gz +*.xml +*.json.gz diff --git a/extra/sitemap/README.md b/extra/sitemap/README.md new file mode 100644 index 00000000..f72893cd --- /dev/null +++ b/extra/sitemap/README.md @@ -0,0 +1,69 @@ + +## HOWTO: Update + +After a container dump, as `fatcat` user on prod server: + +    cd /srv/fatcat/sitemap +    export DATE=`date --iso-8601` # or whatever +    /srv/fatcat/src/extra/sitemap/container_url_lists.sh $DATE /srv/fatcat/snapshots/container_export.json.gz +    /srv/fatcat/src/extra/sitemap/release_url_lists.sh $DATE /srv/fatcat/snapshots/release_export_expanded.json.gz +    # delete old sitemap url lists +    /srv/fatcat/src/extra/sitemap/generate_sitemap_indices.py + +## Background + +Google has a limit of 50k lines / 10 MByte for text sitemap files, and 50K +lines / 50 MByte for XML site map files. Google Scholar has indicated a smaller +20k URL / 5 MB limit. + +For the time being, we will include only a subset of fatcat entities and pages +in our sitemaps. + +- homepage, "about" pages +- all container landing pages (~150k) +- "best" release landing page for each work with fulltext (~25 million) + +In the short term, calculating "best" is tricky so let's just take the first +release with fulltext per work. + +In tree form: + +- `/robots.txt`: static file (in web app) +  - `/sitemap.xml`: about page, etc. 
static file (in web app) +  - `/sitemap-index-containers.xml`: points to gzipped .txt URL lists; generated by scripts +    - `/sitemap-containers-<date>-<shard>.txt.gz` +  - `/sitemap-index-releases.xml`: same as above +    - `/sitemap-releases-<date>-<shard>.txt.gz` + +Workflow: + +- run bash script over container dump, outputting compressed, sharded container sitemaps +- run bash script over work-grouped release dump, outputting compressed, sharded release sitemaps +- run python script to output the sitemap index XML files (`sitemap-index-*.xml`) +- `scp` all of this into place + +To make this work, will configure an nginx rule to point all requests like +`/sitemap-*` to the directory `/srv/fatcat/sitemap/`, and will collect output +there. + +## Ideas on Huge (complete) Index + +With a baseline of 100 million entities, that requires an index file pointing +to at least 2000x individual sitemaps. 3 hex characters is 12 bits, or 4096 +options; seems like an ok granularity to start with. + +Should look into what archive.org does to generate their sitemap.xml, seems +simple, and comes in batches of exactly 50k. + +## Text Sitemaps + +Should be possible to create simple text-style sitemaps, one URL per line, and +link to these from a sitemap index. This is appealing because the sitemaps can +be generated very quickly from identifier SQL dump files, run through UNIX +commands (eg, to split and turn into URLs). Some script to create an XML +sitemap index to point at all the sitemaps would still be needed though. + + +## Resources + +Google sitemap verifier: https://support.google.com/webmasters/answer/7451001 diff --git a/extra/sitemap/container_url_lists.sh b/extra/sitemap/container_url_lists.sh new file mode 100755 index 00000000..fcc0f4b6 --- /dev/null +++ b/extra/sitemap/container_url_lists.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +set -e              # fail on error +set -u              # fail if variable not set in substitution +set -o pipefail     # fail if part of a '|' command fails + +: ${1?' 
You did not supply a date argument'} +: ${2?' You did not supply an input file (JSON gzip)'} +if [ ! -f "$2" ] ; then +  echo "Input file not found: $2" && exit 1; +fi + +# eg, 2020-08-19 +DATE="$1" +# eg, container_export.json.gz +EXPORT_FILE_GZ="$2" + +zcat "$EXPORT_FILE_GZ" \ +    | jq .ident -r \ +    | awk '{print "https://fatcat.wiki/container/" $1 }' \ +    | split --lines 20000 - "sitemap-containers-$DATE-" -d -a 5 --additional-suffix .txt + +gzip sitemap-containers-*.txt diff --git a/extra/sitemap/generate_sitemap_indices.py b/extra/sitemap/generate_sitemap_indices.py new file mode 100755 index 00000000..0a5624a1 --- /dev/null +++ b/extra/sitemap/generate_sitemap_indices.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import sys +import glob +import datetime + +def index_entity(entity_type, output): +    """Write to `output` a sitemap index XML listing every sitemap-{entity_type}-*.txt.gz file in the current directory.""" +    now = datetime.date.today().isoformat() +    print("""<?xml version="1.0" encoding="UTF-8"?>""", file=output) +    print("""<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">""", file=output) + +    for filename in sorted(glob.glob(f"sitemap-{entity_type}-*.txt.gz")):  # glob order is arbitrary; sort for stable output +        print("  <sitemap>", file=output) +        print(f"    <loc>https://fatcat.wiki/{filename}</loc>", file=output) +        print(f"    <lastmod>{now}</lastmod>", file=output) +        print("  </sitemap>", file=output) + +    print("</sitemapindex>", file=output) + +def main(): +    with open('sitemap-index-containers.xml', 'w') as output: +        index_entity("containers", output) +    with open('sitemap-index-releases.xml', 'w') as output: +        index_entity("releases", output) + +if __name__ == "__main__": +    main() diff --git a/extra/sitemap/release_url_lists.sh b/extra/sitemap/release_url_lists.sh new file mode 100755 index 00000000..d5c8d4ef --- /dev/null +++ b/extra/sitemap/release_url_lists.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +set -e              # fail on error +set -u              # fail if variable not set in substitution +set -o pipefail     # fail if 
part of a '|' command fails + +: ${1?' You did not supply a date argument'} +: ${2?' You did not supply an input file (JSON gzip)'} +if [ ! -f "$2" ] ; then +  echo "Input file not found: $2" && exit 1; +fi + +# eg, 2020-08-19 +DATE="$1" +# eg, release_export_expanded.json.gz +EXPORT_FILE_GZ="$2" + +# filter to fulltext releases only, then filter to only one hit per work +zcat "$EXPORT_FILE_GZ" \ +    | rg '"release_ids"' \ +    | rg 'archive.org/' \ +    | rg -v '"stub"' \ +    | jq -r '[.work_id, .ident] | @tsv' \ +    | uniq -w 26 \ +    | cut -f 2 \ +    | awk '{print "https://fatcat.wiki/release/" $1 }' \ +    | split --lines 20000 - "sitemap-releases-$DATE-" -d -a 5 --additional-suffix .txt + +gzip sitemap-releases-*.txt diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index da2bb6cf..9ae2eaa9 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -1157,7 +1157,18 @@ def page_rfc():      return render_template('rfc.html')  @app.route('/robots.txt', methods=['GET']) -def robots(): +def page_robots_txt(): +    if app.config['FATCAT_DOMAIN'] == "fatcat.wiki": +        robots_path = "robots.txt" +    else: +        robots_path = "robots.deny_all.txt"      return send_from_directory(os.path.join(app.root_path, 'static'), -                               'robots.txt', +                               robots_path,                                 mimetype='text/plain') + +@app.route('/sitemap.xml', methods=['GET']) +def page_sitemap_xml(): +    return send_from_directory(os.path.join(app.root_path, 'static'), +                               "sitemap.xml", +                               mimetype='text/xml') + diff --git a/python/fatcat_web/static/robots.deny_all.txt b/python/fatcat_web/static/robots.deny_all.txt new file mode 100644 index 00000000..b88274b1 --- /dev/null +++ b/python/fatcat_web/static/robots.deny_all.txt @@ -0,0 +1,7 @@ +# Hello friends! + +# You have found a QA/development instance of the Fatcat catalog. 
The canonical +# location is https://fatcat.wiki, please crawl and index that location instead. + +User-agent: * +Disallow: / diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt index a168f11b..e89af36e 100644 --- a/python/fatcat_web/static/robots.txt +++ b/python/fatcat_web/static/robots.txt @@ -1 +1,20 @@  # Hello friends! +# If you are considering large or automated crawling, you may want to look at +# our API (https://api.fatcat.wiki) or bulk database snapshots instead. + +# by default, can crawl anything on this domain. HTTP 429 ("backoff") status +# codes are used for rate-limiting instead of any crawl delay specified here. +# Up to a handful concurrent requests should be fine. +User-agent: * +Allow: / + +# crawling search result pages is expensive, so we do specify a long crawl delay for those +User-agent: * +Allow: /release/search +Allow: /container/search +Allow: /coverage/search +Crawl-delay: 5 + +Sitemap: https://fatcat.wiki/sitemap.xml +Sitemap: https://fatcat.wiki/sitemap-index-releases.xml +Sitemap: https://fatcat.wiki/sitemap-index-containers.xml diff --git a/python/fatcat_web/static/sitemap.xml b/python/fatcat_web/static/sitemap.xml new file mode 100644 index 00000000..e6189aa4 --- /dev/null +++ b/python/fatcat_web/static/sitemap.xml @@ -0,0 +1,13 @@ +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> +  <!-- basic site pages --> +  <url><loc>https://fatcat.wiki/</loc></url> +  <url><loc>https://fatcat.wiki/about</loc></url> +  <url><loc>https://fatcat.wiki/rfc</loc></url> +  <url><loc>https://fatcat.wiki/stats</loc></url> +  <url><loc>https://fatcat.wiki/changelog</loc></url> +  <url><loc>https://fatcat.wiki/release/lookup</loc></url> +  <url><loc>https://fatcat.wiki/container/lookup</loc></url> +  <url><loc>https://fatcat.wiki/file/lookup</loc></url> +  <!-- additional entity-level URL lists are linked from robots.txt --> +</urlset> diff --git 
a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html index de32d6a4..7ffa64ca 100644 --- a/python/fatcat_web/templates/home.html +++ b/python/fatcat_web/templates/home.html @@ -8,12 +8,9 @@  {% endblock %}  {% block fullmain %} -<!-- -<div class="ui container text" itemscope itemtype="https://schema.org/WebSite"> -<meta itemprop="url" content="https://{{ config.FATCAT_DOMAIN }}/"/> ---> -<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;"> +<div class ="ui vertical inverted masthead center aligned segment" style="padding-top: 12em; padding-bottom: 10em;" itemscope itemtype="https://schema.org/WebSite"> +  <link itemprop="url" href="https://{{ config.FATCAT_DOMAIN }}/"/>    <div class="ui text container">      <h1 class="ui header inverted huge centered">Perpetual Access to Millions of Open Research Publications From Around The World</h1>      <br> | 
