Diffstat (limited to 'extra')
34 files changed, 1406 insertions, 7 deletions
diff --git a/extra/bulk_edits/2022-03-08_chocula.md b/extra/bulk_edits/2022-03-08_chocula.md new file mode 100644 index 00000000..1877a236 --- /dev/null +++ b/extra/bulk_edits/2022-03-08_chocula.md @@ -0,0 +1,31 @@ + +Periodic import of chocula metadata updates. + +## Prod Import + + date + # Wed Mar 9 02:13:55 UTC 2022 + + git log -n1 + # commit 72e3825893ae614fcd6c6ae8a513745bfefe36b2 + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] + head -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-03-08.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 100, 'exists': 85, 'exists-skip-update': 85, 'update': 14, 'insert': 1, 'skip': 0}) + +Some of these are just "as of" date updates on DOAJ metadata, but most are +"good". Lots of KBART holding dates incremented by a year (to include 2022). + + time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-03-08.json | ./fatcat_import.py chocula --do-updates - + + + Counter({'total': 184950, 'exists': 151925, 'exists-skip-update': 151655, 'update': 29953, 'insert': 3072 + , 'exists-by-issnl': 270, 'skip': 0}) + + real 11m7.011s + user 4m48.705s + sys 0m16.761s + +Great! + +Now update stats, following `extra/container_count_update/README.md`. diff --git a/extra/bulk_edits/2022-03-08_doaj.md b/extra/bulk_edits/2022-03-08_doaj.md new file mode 100644 index 00000000..fc6438d5 --- /dev/null +++ b/extra/bulk_edits/2022-03-08_doaj.md @@ -0,0 +1,23 @@ + +Simple periodic update of DOAJ article-level metadata. + + cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2021-05-25_all.json.gz + => 6.1M 0:18:45 [5.42k/s] + => 7.26M 0:30:45 [3.94k/s] + + export FATCAT_AUTH_WORKER_DOAJ=... + cat /srv/fatcat/tasks/doaj_article_data_2022-03-07_sample_10k.json | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # Counter({'total': 10000, 'exists': 8827, 'exists-fuzzy': 944, 'insert': 219, 'skip': 8, 'skip-title': 8, 'skip-doaj-id-mismatch': 2, 'update': 0}) + + zcat /srv/fatcat/tasks/doaj_article_data_2022-03-07_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + +The above seemed to use too much CPU, and caused a brief outage. Very high CPU +use for just the python import processes, for whatever reason. Turned down +parallelism and trying again: + + zcat /srv/fatcat/tasks/doaj_article_data_2022-03-07_all.json.gz | pv -l | parallel -j6 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # multiple counts of: + # Counter({'total': 1196313, 'exists': 1055412, 'exists-fuzzy': 111490, 'insert': 27835, 'skip': 1280, 'skip-title': 1280, 'skip-doaj-id-mismatch': 296, 'update': 0}) + # estimated only 167,010 new entities + +Then did a follow-up sandcrawler ingest, see notes in that repository. diff --git a/extra/bulk_edits/2022-04-07_initial_datasets.md b/extra/bulk_edits/2022-04-07_initial_datasets.md new file mode 100644 index 00000000..90827a38 --- /dev/null +++ b/extra/bulk_edits/2022-04-07_initial_datasets.md @@ -0,0 +1,22 @@ + +Importing fileset and file entities from initial sandcrawler ingests. + +Git commit: `ede98644a89afd15d903061e0998dbd08851df6d` + +Filesets: + + export FATCAT_AUTH_SANDCRAWLER=[...] 
+ cat /tmp/ingest_dataset_combined_results.2022-04-04.partial.json \ + | ./fatcat_import.py ingest-fileset-results - + # editgroup_5l47i7bscvfmpf4ddytauoekea + # Counter({'total': 195, 'skip': 176, 'skip-hit': 160, 'insert': 19, 'skip-single-file': 14, 'skip-partial-file-info': 2, 'update': 0, 'exists': 0}) + + cat /srv/fatcat/datasets/ingest_dataset_combined_results.2022-04-04.partial.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # editgroup_i2k2ucon7nap3gui3z7amuiug4 + # Counter({'total': 195, 'skip': 184, 'skip-hit': 160, 'skip-status': 24, 'insert': 11, 'update': 0, 'exists': 0}) + +Tried running again, to ensure that there are not duplicate inserts, and that +worked ('exists' instead of 'insert' counts). + +Finally! diff --git a/extra/bulk_edits/2022-04-20_isiarticles.md b/extra/bulk_edits/2022-04-20_isiarticles.md new file mode 100644 index 00000000..b0177a46 --- /dev/null +++ b/extra/bulk_edits/2022-04-20_isiarticles.md @@ -0,0 +1,39 @@ + +See metadata cleanups for context. Basically a couple tens of thousands of sample/spam articles hosted on the domain isiarticles.com. + +## Prod Updates + +Start small: + + export FATCAT_API_HOST=https://api.fatcat.wiki + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search file domain:isiarticles.com --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com/' \ + | head -n50 \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + # editgroup_ihx75kzsebgzfisgjrv67zew5e + +The full batch: + + fatcat-cli search file domain:isiarticles.com --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com/' \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + +And some more with ':80' in the URL: + + fatcat-cli search file domain:isiarticles.com '!content_scope:*' --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com:80/' \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + +Verify: + + fatcat-cli search file domain:isiarticles.com '!content_scope:*' --count + 0 diff --git a/extra/bulk_edits/2022-07-06_chocula.md b/extra/bulk_edits/2022-07-06_chocula.md new file mode 100644 index 00000000..86bf36fb --- /dev/null +++ b/extra/bulk_edits/2022-07-06_chocula.md @@ -0,0 +1,25 @@ + +Periodic import of chocula metadata updates. + +## Prod Import + + date + # Wed Jul 6 23:29:47 UTC 2022 + + git log -n1 + # aff3f40a5177dd6de4eee8ea7bca78df7a595bf3 + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] + head -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-06.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 100, 'exists': 86, 'exists-skip-update': 83, 'update': 13, 'exists-by-issnl': 3, 'insert': 1, 'skip': 0}) + +Many updates are just KBART holding dates or DOAJ as-of dates, but that is fine +and expected. 
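+
+To spot-check what the KBART changes actually look like before the full run, a
+quick jq pass over the export helps (a sketch; assumes export rows are fatcat
+container entities with holdings under `extra.kbart.*.year_spans`):
+
+    head -n1000 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-06.json \
+        | jq -c 'select(.extra.kbart != null) | {issnl, spans: [.extra.kbart[].year_spans]}' \
+        | head -n5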
+
+    time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-06.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 187480, 'exists': 155943, 'exists-skip-update': 151171, 'update': 30437, 'exists-by-issnl': 4772, 'insert': 1100, 'skip': 0})
+    # real    10m28.081s
+    # user    4m37.447s
+    # sys     0m16.063s
+
+Now update stats, following `extra/container_count_update/README.md`.
diff --git a/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md
new file mode 100644
index 00000000..b17e799d
--- /dev/null
+++ b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md
@@ -0,0 +1,38 @@
+
+There is a batch of about 480 releases with DOAJ identifiers but no container
+linkage. These seem to all be from the same actual container:
+
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count
+    # 486
+
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' --index-json -n 0 | jq .container_name
+    # Got 486 hits in 138ms
+    # "Revista de Sistemas, Cibernética e Informática"
+
+Edit pipeline:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+    export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP
+
+    # start small
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 50 \
+        | jq 'select(.container_id == null)' -c \
+        | rg 'Cibernética' \
+        | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627"
+    # editgroup_g2zrm3wkmneoldtqfxpbkaoeh4
+
+Looks good, merged.
+
+    # full auto
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 500 \
+        | jq 'select(.container_id == null)' -c \
+        | rg 'Cibernética' \
+        | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627" --auto-accept
+
+Verify:
+
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count
+    # 0
+
+Also planning to have the DOAJ article importer 'skip' in the future for
+articles with no `container_id` match.
diff --git a/extra/bulk_edits/2022-07-12_jalc.md b/extra/bulk_edits/2022-07-12_jalc.md
new file mode 100644
index 00000000..d9f09fee
--- /dev/null
+++ b/extra/bulk_edits/2022-07-12_jalc.md
@@ -0,0 +1,47 @@
+
+Import of a 2022-04 JALC DOI metadata snapshot.
+
+Note that we had downloaded a prior 2021-04 snapshot, but don't seem to have
+ever imported it.
+
+## Download and Archive
+
+The URL for the bulk snapshot is available at the bottom of this page: <https://form.jst.go.jp/enquetes/jalcmetadatadl_1703>
+
+More info: <http://japanlinkcenter.org/top/service/service_data.html>
+
+    wget 'https://japanlinkcenter.org/lod/JALC-LOD-20220401.gz?jalcmetadatadl_1703'
+    wget 'http://japanlinkcenter.org/top/doc/JaLC_LOD_format.pdf'
+    wget 'http://japanlinkcenter.org/top/doc/JaLC_LOD_sample.pdf'
+
+    mv 'JALC-LOD-20220401.gz?jalcmetadatadl_1703' JALC-LOD-20220401.gz
+
+    ia upload jalc-bulk-metadata-2022-04 -m collection:ia_biblio_metadata jalc_logo.png JALC-LOD-20220401.gz JaLC_LOD_format.pdf JaLC_LOD_sample.pdf
+
+## Import
+
+As of 2022-07-19, 6,502,202 release hits for `doi_registrar:jalc`.
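+
+That count comes from the release search index, and can be re-checked at any
+point with the same kind of query used elsewhere in these notes:
+
+    fatcat-cli search release doi_registrar:jalc --count
+    # 6502202 (as of 2022-07-19)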
+
+Re-download the file:
+
+    cd /srv/fatcat/datasets
+    wget 'https://archive.org/download/jalc-bulk-metadata-2022-04/JALC-LOD-20220401.gz'
+    gunzip JALC-LOD-20220401.gz
+    cd /srv/fatcat/src/python
+
+    wc -l /srv/fatcat/datasets/JALC-LOD-20220401
+    9525225
+
+Start with some samples:
+
+    export FATCAT_AUTH_WORKER_JALC=[...]
+    shuf -n100 /srv/fatcat/datasets/JALC-LOD-20220401 | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+    # Counter({'total': 100, 'exists': 89, 'insert': 11, 'skip': 0, 'update': 0})
+
+Full import (single threaded):
+
+    cat /srv/fatcat/datasets/JALC-LOD-20220401 | pv -l | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+    # 9.53M 22:26:06 [ 117 /s]
+    # Counter({'total': 9510096, 'exists': 8589731, 'insert': 915032, 'skip': 5333, 'inserted.container': 119, 'update': 0})
+
+Wow, almost a million new releases! 7,417,245 results for `doi_registrar:jalc`.
diff --git a/extra/bulk_edits/2022-07-12_orcid.md b/extra/bulk_edits/2022-07-12_orcid.md
new file mode 100644
index 00000000..760a16c8
--- /dev/null
+++ b/extra/bulk_edits/2022-07-12_orcid.md
@@ -0,0 +1,64 @@
+
+Annual ORCID import, using the 2021 public data file. Didn't do this last year,
+so this is a catch-up; will need to do another update later in 2022 (presumably
+in November/December).
+
+Not sure how many records this year. Current count on the orcid.org website is
+over 14 million ORCIDs, as of July 2022.
+
+Files downloaded from:
+
+- <https://info.orcid.org/orcids-2021-public-data-file-is-now-available>
+- <https://orcid.figshare.com/articles/dataset/ORCID_Public_Data_File_2021/16750535>
+- <https://archive.org/details/orcid-dump-2021>
+
+## Prep
+
+    ia upload orcid-dump-2021 -m collection:ia_biblio_metadata ORCID_2021_10_* orcid-logo.png
+
+    wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-3.0.7-full.jar
+
+    java -jar orcid-conversion-lib-3.0.7-full.jar --tarball -i ORCID_2021_10_summaries.tar.gz -v v3_0 -o ORCID_2021_10_summaries_json.tar.gz
+
+    tar xvf ORCID_2021_10_summaries_json.tar.gz
+
+    fd .json ORCID_2021_10_summaries/ | parallel cat {} | jq . -c | pv -l | gzip > ORCID_2021_10_summaries.json.gz
+    # 12.6M 27:59:25 [ 125 /s]
+
+    zcat ORCID_2021_10_summaries.json.gz | shuf -n10000 | gzip > ORCID_2021_10_summaries.sample_10k.json.gz
+
+    ia upload orcid-dump-2021 ORCID_2021_10_summaries.json.gz ORCID_2021_10_summaries.sample_10k.json.gz
+
+## Import
+
+Fetch to prod machine:
+
+    wget https://archive.org/download/orcid-dump-2021/ORCID_2021_10_summaries.json.gz
+    wget https://archive.org/download/orcid-dump-2021/ORCID_2021_10_summaries.sample_10k.json.gz
+
+Sample:
+
+    export FATCAT_AUTH_WORKER_ORCID=[...]
+    zcat /srv/fatcat/datasets/ORCID_2021_10_summaries.sample_10k.json.gz | ./fatcat_import.py orcid -
+    # in 2020:   Counter({'total': 10000, 'exists': 7356, 'insert': 2465, 'skip': 179, 'update': 0})
+    # this time: Counter({'total': 10000, 'exists': 7577, 'insert': 2191, 'skip': 232, 'update': 0})
+
+Bulk import:
+
+    export FATCAT_AUTH_WORKER_ORCID=[...]
+ time zcat /srv/fatcat/datasets/ORCID_2021_10_summaries.json.gz | pv -l | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid - + 12.6M 1:24:04 [2.51k/s] + Counter({'total': 1574111, 'exists': 1185437, 'insert': 347039, 'skip': 41635, 'update': 0}) + Counter({'total': 1583157, 'exists': 1193341, 'insert': 348187, 'skip': 41629, 'update': 0}) + Counter({'total': 1584441, 'exists': 1193385, 'insert': 349424, 'skip': 41632, 'update': 0}) + Counter({'total': 1575971, 'exists': 1187270, 'insert': 347190, 'skip': 41511, 'update': 0}) + Counter({'total': 1577323, 'exists': 1188892, 'insert': 346759, 'skip': 41672, 'update': 0}) + Counter({'total': 1586719, 'exists': 1195610, 'insert': 349115, 'skip': 41994, 'update': 0}) + Counter({'total': 1578484, 'exists': 1189423, 'insert': 347276, 'skip': 41785, 'update': 0}) + Counter({'total': 1578728, 'exists': 1190316, 'insert': 346445, 'skip': 41967, 'update': 0}) + + real 84m5.297s + user 436m26.428s + sys 41m36.959s + +Roughly 2.7 million new ORCIDs, great! diff --git a/extra/bulk_edits/2022-07-13_dblp.md b/extra/bulk_edits/2022-07-13_dblp.md new file mode 100644 index 00000000..25405132 --- /dev/null +++ b/extra/bulk_edits/2022-07-13_dblp.md @@ -0,0 +1,114 @@ + +## Prep + + 2022-07-13 05:24:33 (177 KB/s) - ‘dblp.xml.gz’ saved [715701831/715701831] + + Counter({'total': 9186263, 'skip': 9186263, 'has-doi': 4960506, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0}) + 5.71M 3:37:38 [ 437 /s] + + 7.48k 0:38:18 [3.25 /s] + + +## Container Import + +Run 2022-07-15, after a database backup/snapshot. + + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + # Got 5310 existing dblp container mappings. + # Counter({'total': 7471, 'exists': 7130, 'insert': 341, 'skip': 0, 'update': 0}) + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + 5310 existing_dblp_containers.tsv + 12782 all_dblp_containers.tsv + 7471 dblp_container_meta.json + 7476 prefix_list.txt + + +## Release Import + + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml + # Got 7480 dblp container mappings. 
+
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/gg/X90 ident=gfvkxubvsfdede7ps4af3oa34q
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/visalg/X88 ident=lvfyrd3lvva3hjuaaokzyoscmm
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/msr/PerumaANMO22 ident=2grlescl2bcpvd5yoc4npad3bm
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/dagstuhl/Brodlie97 ident=l6nh222fpjdzfotchu7vfjh6qu
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=series/gidiss/2018 ident=x6t7ze4z55enrlq2dnac4qqbve
+
+    Counter({'total': 9186263, 'exists': 5356574, 'has-doi': 4960506, 'skip': 3633039, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'exists-fuzzy': 192376, 'skip-dblp-container-missing': 156477, 'insert': 4216, 'skip-arxiv': 53, 'skip-dblp-id-mismatch': 5, 'skip-title': 1, 'update': 0})
+
+NOTE: had to re-try in the middle, so these counts are not accurate overall.
+
+Seems like a large number of `skip-dblp-container-missing`. Maybe should have
+re-generated that file differently?
+
+After this import there are 2,217,670 releases with a dblp ID, and 478,983 with
+a dblp ID and no DOI.
+
+
+## Sandcrawler Seedlist Generation
+
+Almost none of the ~487k dblp releases with no DOI have an associated file.
+This implies that no ingest has happened yet, even though the fatcat importer
+does parse and filter the "fulltext" URLs out of dblp records.
+
+    cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+    # 631k 0:02:39 [3.96k/s]
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | jq -r .base_url | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n25
+      43851 ceur-ws.org
+      33638 aclanthology.org
+      32077 aisel.aisnet.org
+      31017 ieeexplore.ieee.org
+      26426 dl.acm.org
+      23817 hdl.handle.net
+      22400 www.isca-speech.org
+      20072 tel.archives-ouvertes.fr
+      18609 www.aaai.org
+      18244 eprint.iacr.org
+      15720 ethos.bl.uk
+      14727 nbn-resolving.org
+      14470 proceedings.mlr.press
+      14095 dl.gi.de
+      12159 proceedings.neurips.cc
+      10890 knowledge.amia.org
+      10049 www.usenix.org
+       9675 papers.nips.cc
+       7541 subs.emis.de
+       7396 openaccess.thecvf.com
+       7345 mindmodeling.org
+       6574 ojs.aaai.org
+       5814 www.lrec-conf.org
+       5773 search.ndltd.org
+       5311 ijcai.org
+
+This is the first ingest, so let's do some sampling in the 'daily' queue:
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n100 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Looks like we can probably get away with doing these in the daily ingest queue,
+instead of bulk? Try a larger batch:
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n10000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Nope, these are going to need bulk ingest then follow-up crawling. Will run a
+heritrix crawl along with the JALC and DOAJ stuff.
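+
+Each line in the requests file is a single sandcrawler ingest request; an easy
+way to eyeball the shape (the fields are the ones emitted by the
+`dblp2ingestrequest.py` transform included in this commit):
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | head -n1 | jq .
+    # keys: base_url, ingest_type ("pdf"), link_source ("dblp"), link_source_id,
+    #       ingest_request_source, release_stage, ext_ids.dblp, edit_extra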
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 631k 0:00:11 [54.0k/s]
+
+
+TODO:
+x python or jq transform of JSON objects
+x filter out german book/library URLs
+x ensure fatcat importer will actually import dblp matches
+x test with a small batch in daily or priority queue
+- enqueue all in bulk mode, even if processed before? many probably MAG or OAI-PMH previously
diff --git a/extra/bulk_edits/2022-07-19_doaj.md b/extra/bulk_edits/2022-07-19_doaj.md
new file mode 100644
index 00000000..d25f2dda
--- /dev/null
+++ b/extra/bulk_edits/2022-07-19_doaj.md
@@ -0,0 +1,78 @@
+
+Doing a batch import of DOAJ articles. Will need to do another one of these
+soon after setting up daily (OAI-PMH feed) ingest.
+
+## Prep
+
+    wget https://doaj.org/csv
+    wget https://doaj.org/public-data-dump/journal
+    wget https://doaj.org/public-data-dump/article
+
+    mv csv journalcsv__doaj_20220719_2135_utf8.csv
+    mv journal doaj_journal_data_2022-07-19.tar.gz
+    mv article doaj_article_data_2022-07-19.tar.gz
+
+    ia upload doaj_data_2022-07-19 -m collection:ia_biblio_metadata ../logo_cropped.jpg journalcsv__doaj_20220719_2135_utf8.csv doaj_journal_data_2022-07-19.tar.gz doaj_article_data_2022-07-19.tar.gz
+
+    tar xvf doaj_journal_data_2022-07-19.tar.gz
+    cat doaj_journal_data_*/journal_batch_*.json | jq .[] -c | pv -l | gzip > doaj_journal_data_2022-07-19_all.json.gz
+
+    tar xvf doaj_article_data_2022-07-19.tar.gz
+    cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2022-07-19_all.json.gz
+
+    ia upload doaj_data_2022-07-19 doaj_journal_data_2022-07-19_all.json.gz doaj_article_data_2022-07-19_all.json.gz
+
+On fatcat machine:
+
+    cd /srv/fatcat/datasets
+    wget https://archive.org/download/doaj_data_2022-07-19/doaj_article_data_2022-07-19_all.json.gz
+
+## Prod Article Import
+
+    git rev: 582495f66e5e08b6e257360097807711e53008d4
+    (includes DOAJ container-id required patch)
+
+    date: Tue Jul 19 22:46:42 UTC 2022
+
+    `doaj_id:*`: 1,335,195 hits
+
+Start with sample:
+
+    zcat /srv/fatcat/datasets/doaj_article_data_2022-07-19_all.json.gz | shuf -n1000 > /srv/fatcat/datasets/doaj_article_data_2022-07-19_sample.json
+
+    export FATCAT_AUTH_WORKER_DOAJ=[...]
+    cat /srv/fatcat/datasets/doaj_article_data_2022-07-19_sample.json | pv -l | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+    # Counter({'total': 1000, 'exists': 895, 'exists-fuzzy': 93, 'insert': 9, 'skip': 3, 'skip-no-container': 3, 'update': 0})
+
+Pretty few imports.
+
+Full ingest:
+
+    export FATCAT_AUTH_WORKER_DOAJ=[...]
+    zcat /srv/fatcat/datasets/doaj_article_data_2022-07-19_all.json.gz | pv -l | parallel -j6 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+    # Counter({'total': 1282908, 'exists': 1145439, 'exists-fuzzy': 117120, 'insert': 16357, 'skip': 3831, 'skip-no-container': 2641, 'skip-title': 1190, 'skip-doaj-id-mismatch': 161, 'update': 0})
+
+That Counter is per-worker; times six, that's around 100k releases added.
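+
+(Each of the six workers prints its own Counter, so grand totals require
+summing the per-worker lines. A sketch, assuming the Counter lines were pasted
+into a hypothetical `counters.txt`:)
+
+    python3 - counters.txt <<'EOF'
+    import ast, collections, sys
+    total = collections.Counter()
+    for line in open(sys.argv[1]):
+        line = line.strip().lstrip('# ')
+        # each worker line looks like: Counter({'total': 1282908, ...})
+        if line.startswith('Counter('):
+            total += collections.Counter(ast.literal_eval(line[8:-1]))
+    print(total)
+    EOF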
+
+Got a bunch of:
+
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=fcdb7a7a9729403d8d99a21f6970dd1d ident=wesvmjwihvblzayfmrvvgr4ulm
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=1455dfe24583480883dbbb293a4bc0c6 ident=lfw57esesjbotms3grvvods5dq
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=88fa65a33c8e484091fc76f4cda59c25 ident=22abqt5qe5e7ngjd5fkyvzyc4q
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=eb7b03dc3dc340cea36891a68a50cce7 ident=ljedohlfyzdkxebgpcswjtd77q
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=519617147ce248ea88d45ab098342153 ident=a63bqkttrbhyxavfr7li2w2xf4
+
+Should investigate!
+
+Also, noticed that the DOAJ importer is hitting `api.fatcat.wiki` (the public
+API endpoint), not the internal one. Guessing this is via fuzzycat.
+
+1,434,266 results for `doaj_id:*`.
+
+Then did a follow-up sandcrawler ingest, see notes in that repository. Note
+that newer ingest can crawl doaj.org, bypassing the sandcrawler SQL load, but
+the direct crawling is probably still faster.
diff --git a/extra/bulk_edits/2022-07-29_chocula.md b/extra/bulk_edits/2022-07-29_chocula.md
new file mode 100644
index 00000000..1f6f36ca
--- /dev/null
+++ b/extra/bulk_edits/2022-07-29_chocula.md
@@ -0,0 +1,47 @@
+
+Periodic import of chocula metadata updates.
+
+In particular, expecting a bunch of `publisher_type` updates.
+
+Going to explicitly skip DOAJ-only updates this time around. That is, if the
+container would have been updated anyway, the new DOAJ 'extra' metadata will
+pass through; but we won't update an entity for that reason alone. This is to
+reduce churn based only on the `as-of` key. Should probably change this
+behavior next time around.
+
+## Prod Import
+
+    date
+    # Sat Jul 30 01:18:41 UTC 2022
+
+    git log -n1
+    # 5ecf72cbb488a9a50eb869ea55b4c2bfc1440731
+
+    diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
+    index 38802bcb..762c44dd 100644
+    --- a/python/fatcat_tools/importers/chocula.py
+    +++ b/python/fatcat_tools/importers/chocula.py
+    @@ -139,7 +139,7 @@ class ChoculaImporter(EntityImporter):
+             if ce.extra.get("publisher_type") and not existing.extra.get("publisher_type"):
+                 # many older containers were missing this metadata
+                 do_update = True
+    -        for k in ("kbart", "ia", "doaj"):
+    +        for k in ("kbart", "ia"):
+                 # always update these fields if not equal (chocula override)
+                 if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
+                     do_update = True
+
+    export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...]
+    shuf -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 100, 'exists': 98, 'exists-skip-update': 98, 'update': 2, 'skip': 0, 'insert': 0})
+
+    shuf -n1000 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 1000, 'exists': 986, 'exists-skip-update': 986, 'update': 12, 'insert': 2, 'skip': 0})
+
+Huh, not seeing any `publisher_type` changes, which I was expecting more of.
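+
+A quick sanity check on how many export rows even carry a `publisher_type`, and
+with which values (a sketch; assumes the field sits at `extra.publisher_type`,
+matching the importer patch above):
+
+    cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json \
+        | jq -r '.extra.publisher_type // empty' \
+        | sort | uniq -c | sort -nr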
+
+    time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 188506, 'exists': 185808, 'exists-skip-update': 185806, 'update': 2495, 'insert': 203, 'exists-by-issnl': 2, 'skip': 0})
+
+Looking through the changelog, some did come through with `publisher_type`
+updates. Whew!
diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md
index 278dc1d8..716c95d6 100644
--- a/extra/bulk_edits/CHANGELOG.md
+++ b/extra/bulk_edits/CHANGELOG.md
@@ -9,6 +9,48 @@ this file should probably get merged into the guide at some point.
 
 This file should not turn into a TODO list!
 
+## 2022-07
+
+Ran a journal-level metadata update, using chocula.
+
+Cleaned up just under 500 releases with missing `container_id` from an older
+DOAJ article import.
+
+Imported roughly 100k releases from DOAJ, new since 2022-04.
+
+Imported roughly 2.7 million new ORCiD `creator` entities, using the 2021 dump
+(first update since the 2020 dump).
+
+Imported almost 1 million new DOI release entities from JALC, the first update
+in more than a year.
+
+Imported at least 400 new dblp containers, and an unknown number of new dblp
+release entities.
+
+Cleaned up about a thousand containers with incorrect `publisher_type`, based
+on the current publisher name. Further updates will populate after the next
+chocula import.
+
+Ran a second batch of journal-level metadata updates, from chocula, resulting
+in a couple thousand updated entities.
+
+
+## 2022-04
+
+Imported some initial fileset entities.
+
+Updated about 25k file entities from isiarticles.com, which are samples (spam
+for a translation service), to remove release linkage and set
+`content_scope=sample` (similar to the springer "page one" case).
+
+## 2022-03
+
+Ran a journal-level metadata update, using chocula.
+
+Ran a DOAJ article-level metadata import, yielding a couple hundred thousand
+new release entities. Crawling and bulk ingest of HTML and PDF fulltext for
+these articles also started.
+
 ## 2022-02
 
 - removed `container_id` linkage for some Datacite DOI releases which are
diff --git a/extra/cleanups/container_publisher_type.md b/extra/cleanups/container_publisher_type.md
new file mode 100644
index 00000000..dba800d3
--- /dev/null
+++ b/extra/cleanups/container_publisher_type.md
@@ -0,0 +1,100 @@
+
+A bunch of MDPI journals are incorrectly listed as 'longtail'.
+
+    fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --count
+    # 245
+
+Because this is 'extra' metadata, need a little python script to change the
+metadata (fatcat-cli doesn't have this feature yet):
+
+    import sys
+    import json
+
+    # target publisher_type, passed as the single CLI argument
+    publisher_type = sys.argv[1].strip().lower()
+    #print(publisher_type, file=sys.stderr)
+
+    # rewrite each entity on stdin with the new publisher_type
+    for line in sys.stdin:
+        if not line.strip():
+            continue
+        container = json.loads(line)
+        container["extra"]["publisher_type"] = publisher_type
+        print(json.dumps(container))
+
+Run some cleanups:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+ export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 50 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" + # editgroup_oum6mnkl2rbn3jaua4a2gdlj5q + +Looks good, run the rest: + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Some more cleanup patterns: + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --count + # 84 + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --count + # 47 + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "commercial")' -c \ + | python3 ./container_publisher_type.py commercial \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 56 + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 98 + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 37 + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 558 + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch 
update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 28 + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Overall, around a thousand containers updated. Changes to releases will not be +reflected until they are re-indexed. diff --git a/extra/cleanups/file_isiarticles.md b/extra/cleanups/file_isiarticles.md new file mode 100644 index 00000000..3858361c --- /dev/null +++ b/extra/cleanups/file_isiarticles.md @@ -0,0 +1,20 @@ + +The domain isiarticles.com hosts a bunch of partial spam PDFs. + +As a first pass, we can remove these via the domain itself. + +A "blocklist" for this domain has been added to sandcrawler, so they should not +get auto-ingested in the future. + + # 2022-04-20 + fatcat-cli search file domain:isiarticles.com --count + 25067 + +## Prod Cleanup + +See bulk edits log. + +Verify cleanup: + + fatcat-cli search file domain:isiarticles.com '!content_scope:*' --count + 0 diff --git a/extra/container_count_update/update_prod.sh b/extra/container_count_update/update_prod.sh new file mode 100755 index 00000000..766398f2 --- /dev/null +++ b/extra/container_count_update/update_prod.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -euo pipefail + +export CONTAINER_INDEX=fatcat_container_v05_20220110 + +fatcat-cli search container --index-json --limit 0 state:active \ + | jq .ident -r \ + | pv -l \ + > container_idents.tsv + +cat container_idents.tsv \ + | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/{}/stats.json' \ + | jq -c . \ + | pv -l \ + > container_stats.json + +cat container_stats.json \ + | jq '{ ident: .ident, releases_total: .total, preservation_bright: .preservation.bright, preservation_dark: .preservation.dark, preservation_shadows_only: .preservation.shadows_only, preservation_none: .preservation.none }' -c \ + | esbulk -verbose -index $CONTAINER_INDEX -optype update -id ident diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore index a04dd76e..60774a12 100644 --- a/extra/dblp/.gitignore +++ b/extra/dblp/.gitignore @@ -4,3 +4,9 @@ series/ Pipfile.lock *.json *.html +*.txt +*.dtd +*.xml +*.xml.gz +*.tsv +*.json.gz diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile index dbf86ac0..69705a3a 100644 --- a/extra/dblp/Pipfile +++ b/extra/dblp/Pipfile @@ -5,6 +5,7 @@ name = "pypi" [packages] selectolax = "*" +urlcanon = "*" [dev-packages] diff --git a/extra/dblp/README.md b/extra/dblp/README.md index e6ccce4f..a95f7214 100644 --- a/extra/dblp/README.md +++ b/extra/dblp/README.md @@ -1,14 +1,51 @@ -This file describes hacks used to import dblp container metadata. +This file describes hacks used to import dblp container and release metadata. -As of December 2020 this is part of the dblp release metadata import pipeline: -we must have conference and other non-ISSN containers created before running -the release import. dblp does not publish container-level metadata in a -structured format (eg, in their dumps), so scraping the HTML is unfortunately -necessary. +The container metadata must be processed and imported first, to create +containers for non-ISSN venues. 
However, dblp only publishes structured +metadata for articles (releases), not venues (containers), so we need to +process the articles, then import the containers, then import the articles. +There is a path that scrapes venue metadata out of dblp.org HTML. -## Quick Bootstrap Commands + +## New Process (2022) + +Usually all of this gets run on a production fatcat instance. It may be +possible to run parts elsewhere, but not confirmed, and would require copying +some set of files around. + + # remove any old/stale files + ./cleanup.sh + + ./prep_container_metadata.sh + +This will take a while to run, after which the container metadata can be +imported, like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + +Check that counts look sane: + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + +Then do release import like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml + +Lastly, to generate sandcrawler ingest requests, from the JSON-dumped partial +release objects:: + + cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz + + +## [OLD] Manual Commands Set up a working directory somewhere: diff --git a/extra/dblp/cleanup.sh b/extra/dblp/cleanup.sh new file mode 100755 index 00000000..52e1a2ea --- /dev/null +++ b/extra/dblp/cleanup.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +# run this as 'fatcat' user on a production machine + +rm -f dblp.dtd +rm -f dblp.xml.gz +rm -f dblp.xml +rm -f dblp_releases_partial.json +rm -f prefix_list.txt +rm -f dblp_container_meta.json +rm -f existing_dblp_containers.tsv +rm -f all_dblp_containers.tsv + +rm -rf ./journals/ +rm -rf ./conf/ +rm -rf ./series/ + diff --git a/extra/dblp/dblp2ingestrequest.py b/extra/dblp/dblp2ingestrequest.py new file mode 100755 index 00000000..bdf5575d --- /dev/null +++ b/extra/dblp/dblp2ingestrequest.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Transform a transformed, fatcat-like dblp object (JSON) into zero or more +sandcrawler ingest requests. +""" + +import argparse +import json +import sys + +import urlcanon + +DOMAIN_BLOCKLIST = [ + # we crawl some of these directly via extid; others are just catalogs + "://arxiv.org/", + "://europepmc.org/", + #"://hdl.handle.net/", + "ncbi.nlm.nih.gov/", + "://doi.org/", + "zenodo.org/", + "figshare.com/", + "://d-nb.info/", + "://www.base-search.net/", +] + + +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) + + +def transform(obj): + """ + Transforms from a single object to zero or more ingest requests. + Returns a list of dicts. 
+ """ + + requests = [] + if not obj["ext_ids"].get("dblp"): + return requests + if not obj.get("_dblp_ee_urls"): + return requests + + for url in obj["_dblp_ee_urls"]: + skip = False + for domain in DOMAIN_BLOCKLIST: + if domain in url: + skip = True + if skip: + continue + try: + base_url = canon(url) + except UnicodeEncodeError: + continue + + request = { + "base_url": base_url, + "ingest_type": "pdf", + "link_source": "dblp", + "link_source_id": obj["ext_ids"]["dblp"], + "ingest_request_source": "dblp", + "release_stage": obj.get("release_stage") or None, + "ext_ids": { + "dblp": obj["ext_ids"]["dblp"], + }, + "edit_extra": {}, + } + requests.append(request) + + return requests + + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + + requests = transform(row) or [] + for r in requests: + print("{}".format(json.dumps(r, sort_keys=True))) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="dblp transformed JSON file to use", type=argparse.FileType("r") + ) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + + +if __name__ == "__main__": + main() diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh new file mode 100755 index 00000000..21a50ab0 --- /dev/null +++ b/extra/dblp/prep_metadata.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# run this as 'fatcat' user on a production machine +#export FATCAT_API_HOST="https://api.fatcat.wiki/v0" + +set -e -u -o pipefail + +# ensure deps +#alias fd=fdfind +fd -h > /dev/null +fatcat-cli -h > /dev/null +pipenv -h > /dev/null + +# ensure pipenv is ready +pipenv install +pipenv run true + + +wget -c 'https://dblp.org/xml/dblp.dtd' +wget -c 'https://dblp.org/xml/dblp.xml.gz' + +zcat dblp.xml.gz > dblp.xml + +cd ../../python +pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json + +cd ../extra/dblp/ + +cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt + +mkdir -p journals +mkdir -p conf +mkdir -p series + +shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +# clean up any failed/empty files, then re-run the above parallel/wget command +find . -empty -type f -delete + +shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +find . 
-empty -type f -delete + +fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json + +fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv + +cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz diff --git a/extra/sql_dumps/Makefile b/extra/sql_dumps/Makefile new file mode 100644 index 00000000..01607d34 --- /dev/null +++ b/extra/sql_dumps/Makefile @@ -0,0 +1,93 @@ + +SHELL=/bin/bash -euo pipefail +TODAY ?= $(shell date --iso --utc) +DATADIR ?= /srv/fatcat/snapshots/$(TODAY) +DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) +DATABASE_URL ?= fatcat_prod + +.PHONY: help +help: ## Print info about all commands + @echo "Commands:" + @echo + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: create_datadir +create_datadir: + mkdir -p $(DATADIR)/ + sudo chmod a+rw $(DATADIR)/ + +$(DATADIR)/.IDENTS: + sudo -u postgres DATABASE_URL=$(DATABASE_URL) ./ident_table_snapshot.sh $(DATADIR) + sudo -u postgres mv /tmp/fatcat_ident_*.tsv $(DATADIR) + touch $@ + +$(DATADIR)/release_export_expanded.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_releases_by_work.tsv | sudo -u fatcat ./target/release/fatcat-export releasebywork --expand files,filesets,webcaptures,container -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/creator_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_creators.tsv | sudo -u fatcat ./target/release/fatcat-export creator -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/container_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_containers.tsv | sudo -u fatcat ./target/release/fatcat-export container -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/file_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_files.tsv | sudo -u fatcat ./target/release/fatcat-export file -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/fileset_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_filesets.tsv | sudo -u fatcat ./target/release/fatcat-export fileset -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/webcapture_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_webcaptures.tsv | sudo -u fatcat ./target/release/fatcat-export webcapture -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/abstracts.json.gz: + sudo -u postgres psql $(DATABASE_URL) < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/file_hashes.tsv.gz: + sudo -u postgres psql $(DATABASE_URL) < dump_file_hashes.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/release_extid.tsv.gz: + sudo -u postgres psql $(DATABASE_URL) < dump_release_extid.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/.METADATA_EXPORT: $(DATADIR)/.IDENTS $(DATADIR)/release_export_expanded.json.gz $(DATADIR)/creator_export.json.gz $(DATADIR)/container_export.json.gz $(DATADIR)/file_export.json.gz $(DATADIR)/fileset_export.json.gz $(DATADIR)/webcapture_export.json.gz $(DATADIR)/abstracts.json.gz $(DATADIR)/file_hashes.tsv.gz $(DATADIR)/release_extid.tsv.gz ## Dump bulk metadata to disk + touch $@ + +.PHONY: metadata-exports +metadata-exports: 
create_datadir $(DATADIR)/.METADATA_EXPORT ## Dump bulk metadata to disk + @echo + + +$(DATADIR)/.METADATA_UPLOADED: $(DATADIR)/.METADATA_EXPORT + ia upload --checksum fatcat_bulk_exports_$(TODAY) ia_exports_item_readme.md --remote-name=README.md -m collection:fatcat_snapshots_and_exports -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Fatcat Bulk Metadata Exports ($(TODAY))" + ia upload fatcat_bulk_exports_$(TODAY) $(DATADIR)/*_export.json.gz $(DATADIR)/*_export_expanded.json.gz $(DATADIR)/abstracts.json.gz $(DATADIR)/file_hashes.tsv.gz $(DATADIR)/release_extid.tsv.gz + touch $@ + +.PHONY: upload-metadata-exports +upload-metadata-exports: create_datadir $(DATADIR)/.METADATA_UPLOADED ## Upload bulk metadata exports to archive.org + @echo + +$(DATADIR)/.PUBLIC_DB_DUMP: + sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=auth_oidc fatcat_prod > $(DATADIR)/fatcat_public_dbdump_${DATESLUG}.pgdump.wip + mv $(DATADIR)/fatcat_public_dbdump_${DATESLUG}.pgdump.wip $(DATADIR)/fatcat_public_dbdump_${DATESLUG}.pgdump + touch $@ + +.PHONY: public-database-snapshot +public-database-snapshot: create_datadir $(DATADIR)/.PUBLIC_DB_DUMP ## Create SQL database snapshot which can be shared publicly + @echo + +$(DATADIR)/.PUBLIC_DB_UPLOADED: $(DATADIR)/.PUBLIC_DB_DUMP + ia upload --checksum fatcat_sqldump_public_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:fatcat_snapshots_and_exports -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Fatcat Public Database Snapshot ($(TODAY))" + ia upload --checksum fatcat_sqldump_public_$(TODAY) $(DATADIR)/fatcat_public_dbdump_*.pgdump + touch $@ + +.PHONY: upload-public-database-snapshot +upload-public-database-snapshot: create_datadir public-database-snapshot $(DATADIR)/.PUBLIC_DB_UPLOADED ## Upload metadata snapshot to archive.org + @echo diff --git a/extra/stats/2022-03-21-prod-stats.json b/extra/stats/2022-03-21-prod-stats.json new file mode 100644 index 00000000..4a82860f --- /dev/null +++ b/extra/stats/2022-03-21-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5850551,"timestamp":"2022-03-21T22:32:20.050613+00:00"}},"container":{"total":190480},"papers":{"in_kbart":77243855,"in_web":33064706,"in_web_not_kbart":15982780,"is_oa":24345482,"total":126701207},"release":{"refs_total":1269080199,"total":180472435}} diff --git a/extra/stats/2022-03-21-prod-table-sizes.txt b/extra/stats/2022-03-21-prod-table-sizes.txt new file mode 100644 index 00000000..328deec0 --- /dev/null +++ b/extra/stats/2022-03-21-prod-table-sizes.txt @@ -0,0 +1,47 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 707.94G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 86 GB | 31 GB | 117 GB + "public"."refs_blob" | 114 GB | 2185 MB | 116 GB + "public"."release_rev" | 82 GB | 25 GB | 107 GB + "public"."file_rev" | 35 GB | 29 GB | 64 GB + "public"."release_edit" | 18 GB | 20 GB | 38 GB + "public"."file_rev_url" | 30 GB | 7856 MB | 37 GB + "public"."work_edit" | 17 GB | 19 GB | 36 GB + "public"."abstracts" | 33 GB | 2829 MB | 36 GB + "public"."file_edit" | 17 GB | 15 GB | 33 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 11 GB | 11 GB | 23 GB + "public"."file_rev_release" | 8709 MB | 10 GB | 19 GB + "public"."file_ident" | 7478 MB | 7579 MB | 15 GB + "public"."work_rev" | 7552 MB | 5238 MB | 12 GB + 
"public"."release_ref" | 6486 MB | 5199 MB | 11 GB + "public"."release_rev_abstract" | 4718 MB | 5174 MB | 9892 MB + "public"."webcapture_rev_cdx" | 3491 MB | 338 MB | 3829 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1224 MB | 252 MB | 1476 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 515 MB | 641 MB | 1157 MB + "public"."changelog" | 351 MB | 297 MB | 648 MB + "public"."container_rev" | 228 MB | 45 MB | 273 MB + "public"."webcapture_edit" | 66 MB | 47 MB | 113 MB + "public"."container_edit" | 59 MB | 54 MB | 113 MB + "public"."webcapture_rev_url" | 54 MB | 20 MB | 74 MB + "public"."webcapture_rev_release" | 20 MB | 35 MB | 54 MB + "public"."webcapture_rev" | 38 MB | 14 MB | 51 MB + "public"."webcapture_ident" | 22 MB | 27 MB | 49 MB + "public"."container_ident" | 13 MB | 20 MB | 33 MB + "public"."editor" | 88 kB | 128 kB | 216 kB + "public"."auth_oidc" | 88 kB | 120 kB | 208 kB + "public"."editgroup_annotation" | 72 kB | 48 kB | 120 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2022-04-20-prod-stats.json b/extra/stats/2022-04-20-prod-stats.json new file mode 100644 index 00000000..90c673d5 --- /dev/null +++ b/extra/stats/2022-04-20-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5894575,"timestamp":"2022-04-20T04:19:07.676356+00:00"}},"container":{"total":191000},"papers":{"in_kbart":77487575,"in_web":33633575,"in_web_not_kbart":16531221,"is_oa":24576975,"total":127353365},"release":{"refs_total":1288004961,"total":181877321}} diff --git a/extra/stats/2022-05-15-prod-stats.json b/extra/stats/2022-05-15-prod-stats.json new file mode 100644 index 00000000..37c83f69 --- /dev/null +++ b/extra/stats/2022-05-15-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5946444,"timestamp":"2022-05-15T21:40:19.956944+00:00"}},"container":{"total":191315},"papers":{"in_kbart":77657266,"in_web":34881556,"in_web_not_kbart":17596263,"is_oa":24739223,"total":127770175},"release":{"refs_total":1303116536,"total":182728162}} diff --git a/extra/stats/2022-07-06-prod-stats.json b/extra/stats/2022-07-06-prod-stats.json new file mode 100644 index 00000000..c93b4e0c --- /dev/null +++ b/extra/stats/2022-07-06-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6011966,"timestamp":"2022-07-06T23:45:10.513758+00:00"}},"container":{"total":193199},"papers":{"in_kbart":78038966,"in_web":35774136,"in_web_not_kbart":18362815,"is_oa":25221089,"total":128843120},"release":{"refs_total":1335962190,"total":184701189}} diff --git a/extra/stats/2022-07-06-prod-table-sizes.txt b/extra/stats/2022-07-06-prod-table-sizes.txt new file mode 100644 index 00000000..01d205b1 --- /dev/null +++ b/extra/stats/2022-07-06-prod-table-sizes.txt @@ -0,0 +1,48 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 732.62G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 88 GB | 32 GB | 120 GB + "public"."refs_blob" | 118 GB | 2196 MB | 120 GB + "public"."release_rev" | 85 GB 
| 25 GB | 110 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 18 GB | 21 GB | 39 GB + "public"."file_rev_url" | 31 GB | 7999 MB | 39 GB + "public"."abstracts" | 35 GB | 3655 MB | 38 GB + "public"."work_edit" | 17 GB | 19 GB | 37 GB + "public"."file_edit" | 18 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + "public"."file_rev_release" | 8910 MB | 10 GB | 19 GB + "public"."file_ident" | 7704 MB | 7605 MB | 15 GB + "public"."work_rev" | 7741 MB | 5238 MB | 13 GB + "public"."release_ref" | 6714 MB | 5646 MB | 12 GB + "public"."release_rev_abstract" | 5015 MB | 7213 MB | 12 GB + "public"."webcapture_rev_cdx" | 4340 MB | 419 MB | 4758 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1282 MB | 256 MB | 1537 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 522 MB | 648 MB | 1170 MB + "public"."changelog" | 378 MB | 301 MB | 679 MB + "public"."container_rev" | 249 MB | 60 MB | 308 MB + "public"."webcapture_edit" | 82 MB | 53 MB | 135 MB + "public"."container_edit" | 63 MB | 69 MB | 132 MB + "public"."webcapture_rev_url" | 65 MB | 22 MB | 87 MB + "public"."webcapture_rev_release" | 24 MB | 35 MB | 59 MB + "public"."webcapture_rev" | 45 MB | 14 MB | 59 MB + "public"."webcapture_ident" | 27 MB | 27 MB | 54 MB + "public"."container_ident" | 13 MB | 20 MB | 34 MB + "public"."auth_oidc" | 104 kB | 160 kB | 264 kB + "public"."editor" | 96 kB | 160 kB | 256 kB + "public"."editgroup_annotation" | 80 kB | 48 kB | 128 kB + "public"."fileset_rev_file" | 88 kB | 32 kB | 120 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) + diff --git a/extra/stats/2022-07-14-prod-stats.json b/extra/stats/2022-07-14-prod-stats.json new file mode 100644 index 00000000..62d06606 --- /dev/null +++ b/extra/stats/2022-07-14-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6036957,"timestamp":"2022-07-14T18:53:18.228827+00:00"}},"container":{"total":193300},"papers":{"in_kbart":78102604,"in_web":36247601,"in_web_not_kbart":18551021,"is_oa":25281045,"total":128995907},"release":{"refs_total":1340195856,"total":184966214}} diff --git a/extra/stats/2022-07-14-prod-table-sizes.txt b/extra/stats/2022-07-14-prod-table-sizes.txt new file mode 100644 index 00000000..b4fae69a --- /dev/null +++ b/extra/stats/2022-07-14-prod-table-sizes.txt @@ -0,0 +1,47 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 735.11G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 88 GB | 32 GB | 121 GB + "public"."refs_blob" | 119 GB | 2200 MB | 121 GB + "public"."release_rev" | 85 GB | 25 GB | 110 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 18 GB | 21 GB | 39 GB + "public"."file_rev_url" | 31 GB | 8106 MB | 39 GB + "public"."abstracts" | 35 GB | 3671 MB | 39 GB + "public"."work_edit" | 17 GB | 20 GB | 37 GB + "public"."file_edit" | 18 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + 
"public"."file_rev_release" | 8975 MB | 10 GB | 19 GB + "public"."file_ident" | 7775 MB | 7615 MB | 15 GB + "public"."work_rev" | 7753 MB | 5238 MB | 13 GB + "public"."release_ref" | 6721 MB | 5662 MB | 12 GB + "public"."release_rev_abstract" | 5035 MB | 7250 MB | 12 GB + "public"."webcapture_rev_cdx" | 4341 MB | 419 MB | 4760 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1294 MB | 256 MB | 1550 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 524 MB | 649 MB | 1173 MB + "public"."changelog" | 383 MB | 301 MB | 685 MB + "public"."container_rev" | 249 MB | 60 MB | 308 MB + "public"."webcapture_edit" | 82 MB | 53 MB | 135 MB + "public"."container_edit" | 63 MB | 69 MB | 132 MB + "public"."webcapture_rev_url" | 65 MB | 22 MB | 87 MB + "public"."webcapture_rev_release" | 24 MB | 35 MB | 59 MB + "public"."webcapture_rev" | 45 MB | 14 MB | 59 MB + "public"."webcapture_ident" | 27 MB | 27 MB | 54 MB + "public"."container_ident" | 13 MB | 20 MB | 34 MB + "public"."auth_oidc" | 104 kB | 160 kB | 264 kB + "public"."editor" | 96 kB | 160 kB | 256 kB + "public"."editgroup_annotation" | 80 kB | 48 kB | 128 kB + "public"."fileset_rev_file" | 88 kB | 32 kB | 120 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2022-07-29-prod-stats.json b/extra/stats/2022-07-29-prod-stats.json new file mode 100644 index 00000000..41d234ea --- /dev/null +++ b/extra/stats/2022-07-29-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6143354,"timestamp":"2022-07-30T01:39:22.900415+00:00"}},"container":{"total":194187},"papers":{"in_kbart":78331069,"in_web":36847123,"in_web_not_kbart":18991886,"is_oa":25579026,"total":130376642},"release":{"refs_total":1350391488,"total":186556315}} diff --git a/extra/stats/2022-07-29-prod-table-sizes.txt b/extra/stats/2022-07-29-prod-table-sizes.txt new file mode 100644 index 00000000..cb85078f --- /dev/null +++ b/extra/stats/2022-07-29-prod-table-sizes.txt @@ -0,0 +1,48 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 748.75G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 89 GB | 33 GB | 122 GB + "public"."refs_blob" | 119 GB | 2218 MB | 121 GB + "public"."release_rev" | 86 GB | 25 GB | 111 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 19 GB | 21 GB | 40 GB + "public"."file_rev_url" | 31 GB | 8306 MB | 40 GB + "public"."abstracts" | 35 GB | 3700 MB | 39 GB + "public"."work_edit" | 17 GB | 20 GB | 37 GB + "public"."file_edit" | 19 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 24 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + "public"."file_rev_release" | 9031 MB | 10 GB | 19 GB + "public"."file_ident" | 7837 MB | 7624 MB | 15 GB + "public"."work_rev" | 7823 MB | 5243 MB | 13 GB + "public"."release_ref" | 6882 MB | 6039 MB | 13 GB + "public"."release_rev_abstract" | 5100 MB | 7327 MB | 12 GB + "public"."webcapture_rev_cdx" | 8090 MB | 760 MB | 8849 MB + "public"."creator_edit" | 1203 MB | 1919 MB | 3122 MB + "public"."creator_rev" | 1198 MB | 
diff --git a/extra/stats/2022-07-29-prod-table-sizes.txt b/extra/stats/2022-07-29-prod-table-sizes.txt
new file mode 100644
index 00000000..cb85078f
--- /dev/null
+++ b/extra/stats/2022-07-29-prod-table-sizes.txt
@@ -0,0 +1,48 @@
+PostgreSQL 13.5 - wbgrp-svc502.us.archive.org
+Size: 748.75G
+
+              table_name               | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib"            | 89 GB      | 33 GB        | 122 GB
+ "public"."refs_blob"                  | 119 GB     | 2218 MB      | 121 GB
+ "public"."release_rev"                | 86 GB      | 25 GB        | 111 GB
+ "public"."file_rev"                   | 36 GB      | 29 GB        | 65 GB
+ "public"."release_edit"               | 19 GB      | 21 GB        | 40 GB
+ "public"."file_rev_url"               | 31 GB      | 8306 MB      | 40 GB
+ "public"."abstracts"                  | 35 GB      | 3700 MB      | 39 GB
+ "public"."work_edit"                  | 17 GB      | 20 GB        | 37 GB
+ "public"."file_edit"                  | 19 GB      | 16 GB        | 34 GB
+ "public"."release_ident"              | 12 GB      | 12 GB        | 24 GB
+ "public"."work_ident"                 | 12 GB      | 11 GB        | 23 GB
+ "public"."file_rev_release"           | 9031 MB    | 10 GB        | 19 GB
+ "public"."file_ident"                 | 7837 MB    | 7624 MB      | 15 GB
+ "public"."work_rev"                   | 7823 MB    | 5243 MB      | 13 GB
+ "public"."release_ref"                | 6882 MB    | 6039 MB      | 13 GB
+ "public"."release_rev_abstract"       | 5100 MB    | 7327 MB      | 12 GB
+ "public"."webcapture_rev_cdx"         | 8090 MB    | 760 MB       | 8849 MB
+ "public"."creator_edit"               | 1203 MB    | 1919 MB      | 3122 MB
+ "public"."creator_rev"                | 1198 MB    | 1427 MB      | 2624 MB
+ "public"."creator_ident"              | 812 MB     | 1258 MB      | 2070 MB
+ "public"."editgroup"                  | 1325 MB    | 261 MB       | 1587 MB
+ "public"."release_rev_extid"          | 537 MB     | 668 MB       | 1204 MB
+ "public"."changelog"                  | 395 MB     | 307 MB       | 702 MB
+ "public"."container_rev"              | 251 MB     | 61 MB        | 312 MB
+ "public"."webcapture_edit"            | 144 MB     | 99 MB        | 242 MB
+ "public"."webcapture_rev_url"         | 113 MB     | 42 MB        | 155 MB
+ "public"."container_edit"             | 63 MB      | 71 MB        | 135 MB
+ "public"."webcapture_rev_release"     | 40 MB      | 70 MB        | 110 MB
+ "public"."webcapture_rev"             | 77 MB      | 27 MB        | 104 MB
+ "public"."webcapture_ident"           | 45 MB      | 54 MB        | 100 MB
+ "public"."container_ident"            | 13 MB      | 21 MB        | 34 MB
+ "public"."auth_oidc"                  | 104 kB     | 160 kB       | 264 kB
+ "public"."editor"                     | 96 kB      | 160 kB       | 256 kB
+ "public"."editgroup_annotation"       | 88 kB      | 48 kB        | 136 kB
+ "public"."fileset_rev_file"           | 88 kB      | 32 kB        | 120 kB
+ "public"."fileset_edit"               | 16 kB      | 48 kB        | 64 kB
+ "public"."fileset_rev_url"            | 16 kB      | 32 kB        | 48 kB
+ "public"."fileset_rev_release"        | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_ident"              | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_rev"                | 16 kB      | 16 kB        | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB        | 24 kB
+(41 rows)
+
diff --git a/extra/stats/2022-09-06-prod-stats.json b/extra/stats/2022-09-06-prod-stats.json
new file mode 100644
index 00000000..e7755f9f
--- /dev/null
+++ b/extra/stats/2022-09-06-prod-stats.json
@@ -0,0 +1,146 @@
+[HTML capture of the fatcat.wiki /stats page (site v0.5.1-109-g5ecf72cb), committed in
+place of the usual single-line JSON export; setting aside the page chrome, the numbers
+it reports are: changelog index 6205643 (2022-09-06); "papers": total 131,082,821,
+fulltext on web 37,792,347, "gold" OA 25,854,622, in a Keepers/KBART archive
+78,632,782, on web but not in Keepers 19,770,890; releases: total 188,266,679;
+containers: total 194,685]
diff --git a/extra/stats/2022-09-06-table-sizes.txt b/extra/stats/2022-09-06-table-sizes.txt
new file mode 100644
index 00000000..ddbd6842
--- /dev/null
+++ b/extra/stats/2022-09-06-table-sizes.txt
@@ -0,0 +1,48 @@
+
+PostgreSQL 13.5 - wbgrp-svc502.us.archive.org
+Size: 760.02G
+
+              table_name               | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib"            | 90 GB      | 34 GB        | 124 GB
+ "public"."refs_blob"                  | 121 GB     | 2295 MB      | 123 GB
+ "public"."release_rev"                | 87 GB      | 26 GB        | 112 GB
+ "public"."file_rev"                   | 36 GB      | 30 GB        | 66 GB
+ "public"."file_rev_url"               | 32 GB      | 8778 MB      | 40 GB
+ "public"."release_edit"               | 19 GB      | 21 GB        | 40 GB
+ "public"."abstracts"                  | 36 GB      | 3726 MB      | 40 GB
+ "public"."work_edit"                  | 18 GB      | 20 GB        | 38 GB
+ "public"."file_edit"                  | 19 GB      | 16 GB        | 35 GB
+ "public"."release_ident"              | 12 GB      | 12 GB        | 24 GB
+ "public"."work_ident"                 | 12 GB      | 12 GB        | 23 GB
+ "public"."file_rev_release"           | 9100 MB    | 10 GB        | 19 GB
+ "public"."file_ident"                 | 7914 MB    | 7647 MB      | 15 GB
+ "public"."release_ref"                | 7012 MB    | 6486 MB      | 13 GB
+ "public"."work_rev"                   | 7900 MB    | 5280 MB      | 13 GB
+ "public"."release_rev_abstract"       | 5217 MB    | 7395 MB      | 12 GB
+ "public"."webcapture_rev_cdx"         | 9173 MB    | 862 MB       | 10035 MB
+ "public"."creator_edit"               | 1203 MB    | 1919 MB      | 3122 MB
+ "public"."creator_rev"                | 1198 MB    | 1427 MB      | 2624 MB
+ "public"."creator_ident"              | 812 MB     | 1258 MB      | 2070 MB
+ "public"."editgroup"                  | 1347 MB    | 272 MB       | 1620 MB
+ "public"."release_rev_extid"          | 540 MB     | 672 MB       | 1212 MB
+ "public"."changelog"                  | 406 MB     | 318 MB       | 724 MB
+ "public"."container_rev"              | 251 MB     | 62 MB        | 313 MB
+ "public"."webcapture_edit"            | 168 MB     | 105 MB       | 273 MB
+ "public"."webcapture_rev_url"         | 133 MB     | 44 MB        | 177 MB
+ "public"."container_edit"             | 63 MB      | 72 MB        | 135 MB
+ "public"."webcapture_rev_release"     | 48 MB      | 71 MB        | 119 MB
+ "public"."webcapture_rev"             | 91 MB      | 27 MB        | 118 MB
+ "public"."webcapture_ident"           | 54 MB      | 55 MB        | 108 MB
+ "public"."container_ident"            | 13 MB      | 21 MB        | 34 MB
+ "public"."editor"                     | 104 kB     | 168 kB       | 272 kB
+ "public"."auth_oidc"                  | 104 kB     | 160 kB       | 264 kB
+ "public"."editgroup_annotation"       | 88 kB      | 48 kB        | 136 kB
+ "public"."fileset_rev_file"           | 88 kB      | 32 kB        | 120 kB
+ "public"."fileset_edit"               | 16 kB      | 48 kB        | 64 kB
+ "public"."fileset_rev_url"            | 16 kB      | 32 kB        | 48 kB
+ "public"."fileset_rev_release"        | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_ident"              | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_rev"                | 16 kB      | 16 kB        | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB        | 24 kB
+(41 rows)
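Successive snapshots make growth easy to eyeball: between 2022-07-14 and 2022-07-29 the release count grew by about 1.6 million. A quick check of that kind, as a sketch (run from `extra/stats/`, with both snapshot files present):

    # growth in release and "paper" counts between two stats snapshots
    jq -s '{
      new_releases: (.[1].release.total - .[0].release.total),
      new_papers:   (.[1].papers.total  - .[0].papers.total)
    }' 2022-07-14-prod-stats.json 2022-07-29-prod-stats.json
    # {"new_releases": 1590101, "new_papers": 1380735}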