Diffstat (limited to 'extra')
34 files changed, 1406 insertions, 7 deletions
diff --git a/extra/bulk_edits/2022-03-08_chocula.md b/extra/bulk_edits/2022-03-08_chocula.md new file mode 100644 index 00000000..1877a236 --- /dev/null +++ b/extra/bulk_edits/2022-03-08_chocula.md @@ -0,0 +1,31 @@ + +Periodic import of chocula metadata updates. + +## Prod Import + + date + # Wed Mar 9 02:13:55 UTC 2022 + + git log -n1 + # commit 72e3825893ae614fcd6c6ae8a513745bfefe36b2 + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] + head -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-03-08.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 100, 'exists': 85, 'exists-skip-update': 85, 'update': 14, 'insert': 1, 'skip': 0}) + +Some of these are just "as of" date updates on DOAJ metadata, but most are +"good". Lots of KBART holding dates incremented by a year (to include 2022). + + time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-03-08.json | ./fatcat_import.py chocula --do-updates - + + + Counter({'total': 184950, 'exists': 151925, 'exists-skip-update': 151655, 'update': 29953, 'insert': 3072 + , 'exists-by-issnl': 270, 'skip': 0}) + + real 11m7.011s + user 4m48.705s + sys 0m16.761s + +Great! + +Now update stats, following `extra/container_count_update/README.md`. diff --git a/extra/bulk_edits/2022-03-08_doaj.md b/extra/bulk_edits/2022-03-08_doaj.md new file mode 100644 index 00000000..fc6438d5 --- /dev/null +++ b/extra/bulk_edits/2022-03-08_doaj.md @@ -0,0 +1,23 @@ + +Simple periodic update of DOAJ article-level metadata. + + cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2021-05-25_all.json.gz + => 6.1M 0:18:45 [5.42k/s] + => 7.26M 0:30:45 [3.94k/s] + + export FATCAT_AUTH_WORKER_DOAJ=... + cat /srv/fatcat/tasks/doaj_article_data_2022-03-07_sample_10k.json | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # Counter({'total': 10000, 'exists': 8827, 'exists-fuzzy': 944, 'insert': 219, 'skip': 8, 'skip-title': 8, 'skip-doaj-id-mismatch': 2, 'update': 0}) + + zcat /srv/fatcat/tasks/doaj_article_data_2022-03-07_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + +The above seemed to use too much CPU, and caused a brief outage. Very high CPU +use for just the python import processes, for whatever reason. Turned down +parallelism and trying again: + + zcat /srv/fatcat/tasks/doaj_article_data_2022-03-07_all.json.gz | pv -l | parallel -j6 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # multiple counts of: + # Counter({'total': 1196313, 'exists': 1055412, 'exists-fuzzy': 111490, 'insert': 27835, 'skip': 1280, 'skip-title': 1280, 'skip-doaj-id-mismatch': 296, 'update': 0}) + # estimated only 167,010 new entities + +Then did a follow-up sandcrawler ingest, see notes in that repository. diff --git a/extra/bulk_edits/2022-04-07_initial_datasets.md b/extra/bulk_edits/2022-04-07_initial_datasets.md new file mode 100644 index 00000000..90827a38 --- /dev/null +++ b/extra/bulk_edits/2022-04-07_initial_datasets.md @@ -0,0 +1,22 @@ + +Importing fileset and file entities from initial sandcrawler ingests. + +Git commit: `ede98644a89afd15d903061e0998dbd08851df6d` + +Filesets: + + export FATCAT_AUTH_SANDCRAWLER=[...] 
+ cat /tmp/ingest_dataset_combined_results.2022-04-04.partial.json \ + | ./fatcat_import.py ingest-fileset-results - + # editgroup_5l47i7bscvfmpf4ddytauoekea + # Counter({'total': 195, 'skip': 176, 'skip-hit': 160, 'insert': 19, 'skip-single-file': 14, 'skip-partial-file-info': 2, 'update': 0, 'exists': 0}) + + cat /srv/fatcat/datasets/ingest_dataset_combined_results.2022-04-04.partial.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # editgroup_i2k2ucon7nap3gui3z7amuiug4 + # Counter({'total': 195, 'skip': 184, 'skip-hit': 160, 'skip-status': 24, 'insert': 11, 'update': 0, 'exists': 0}) + +Tried running again, to ensure that there are not duplicate inserts, and that +worked ('exists' instead of 'insert' counts). + +Finally! diff --git a/extra/bulk_edits/2022-04-20_isiarticles.md b/extra/bulk_edits/2022-04-20_isiarticles.md new file mode 100644 index 00000000..b0177a46 --- /dev/null +++ b/extra/bulk_edits/2022-04-20_isiarticles.md @@ -0,0 +1,39 @@ + +See metadata cleanups for context. Basically a couple tens of thousands of sample/spam articles hosted on the domain isiarticles.com. + +## Prod Updates + +Start small: + + export FATCAT_API_HOST=https://api.fatcat.wiki + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search file domain:isiarticles.com --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com/' \ + | head -n50 \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + # editgroup_ihx75kzsebgzfisgjrv67zew5e + +The full batch: + + fatcat-cli search file domain:isiarticles.com --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com/' \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + +And some more with ':80' in the URL: + + fatcat-cli search file domain:isiarticles.com '!content_scope:*' --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com:80/' \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + +Verify: + + fatcat-cli search file domain:isiarticles.com '!content_scope:*' --count + 0 diff --git a/extra/bulk_edits/2022-07-06_chocula.md b/extra/bulk_edits/2022-07-06_chocula.md new file mode 100644 index 00000000..86bf36fb --- /dev/null +++ b/extra/bulk_edits/2022-07-06_chocula.md @@ -0,0 +1,25 @@ + +Periodic import of chocula metadata updates. + +## Prod Import + + date + # Wed Jul 6 23:29:47 UTC 2022 + + git log -n1 + # aff3f40a5177dd6de4eee8ea7bca78df7a595bf3 + + export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...] + head -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-06.json | ./fatcat_import.py chocula --do-updates - + # Counter({'total': 100, 'exists': 86, 'exists-skip-update': 83, 'update': 13, 'exists-by-issnl': 3, 'insert': 1, 'skip': 0}) + +Many updates are just KBART holding dates or DOAJ as-of dates, but that is fine +and expected. 
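+
+To spot-check what the KBART changes actually look like before the full run, a
+quick jq pass over the export helps (a sketch; assumes export rows are fatcat
+container entities with holdings under `extra.kbart.*.year_spans`):
+
+    head -n1000 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-06.json \
+        | jq -c 'select(.extra.kbart != null) | {issnl, spans: [.extra.kbart[].year_spans]}' \
+        | head -n5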
+
+    time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-06.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 187480, 'exists': 155943, 'exists-skip-update': 151171, 'update': 30437, 'exists-by-issnl': 4772, 'insert': 1100, 'skip': 0})
+    # real    10m28.081s
+    # user    4m37.447s
+    # sys     0m16.063s
+
+Now update stats, following `extra/container_count_update/README.md`.
diff --git a/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md
new file mode 100644
index 00000000..b17e799d
--- /dev/null
+++ b/extra/bulk_edits/2022-07-12_cleanup_doaj_missing_container_id.md
@@ -0,0 +1,38 @@
+
+There is a batch of about 480 releases with DOAJ identifiers but no container
+linkage. These seem to all be from the same actual container:
+
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count
+    # 486
+
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' --index-json -n 0 | jq .container_name
+    # Got 486 hits in 138ms
+    # "Revista de Sistemas, Cibernética e Informática"
+
+Edit pipeline:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+    export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP
+
+    # start small
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 50 \
+        | jq 'select(.container_id == null)' -c \
+        | rg 'Cibernética' \
+        | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627"
+    # editgroup_g2zrm3wkmneoldtqfxpbkaoeh4
+
+Looks good, merged.
+
+    # full auto
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' 'journal:Cibernética' --entity-json --limit 500 \
+        | jq 'select(.container_id == null)' -c \
+        | rg 'Cibernética' \
+        | fatcat-cli batch update release container_id=ubwuhr4obzgr7aadszhurhef5m --description "Add container linkage for DOAJ articles with ISSN 1690-8627" --auto-accept
+
+Verify:
+
+    fatcat-cli search releases 'doaj_id:*' '!container_id:*' --count
+    # 0
+
+Also planning to have the DOAJ article importer 'skip' in the future for
+articles with no `container_id` match.
diff --git a/extra/bulk_edits/2022-07-12_jalc.md b/extra/bulk_edits/2022-07-12_jalc.md
new file mode 100644
index 00000000..d9f09fee
--- /dev/null
+++ b/extra/bulk_edits/2022-07-12_jalc.md
@@ -0,0 +1,47 @@
+
+Import of a 2022-04 JALC DOI metadata snapshot.
+
+Note that we had downloaded a prior 2021-04 snapshot, but don't seem to have
+ever imported it.
+
+## Download and Archive
+
+The URL for the bulk snapshot is available at the bottom of this page: <https://form.jst.go.jp/enquetes/jalcmetadatadl_1703>
+
+More info: <http://japanlinkcenter.org/top/service/service_data.html>
+
+    wget 'https://japanlinkcenter.org/lod/JALC-LOD-20220401.gz?jalcmetadatadl_1703'
+    wget 'http://japanlinkcenter.org/top/doc/JaLC_LOD_format.pdf'
+    wget 'http://japanlinkcenter.org/top/doc/JaLC_LOD_sample.pdf'
+
+    mv 'JALC-LOD-20220401.gz?jalcmetadatadl_1703' JALC-LOD-20220401.gz
+
+    ia upload jalc-bulk-metadata-2022-04 -m collection:ia_biblio_metadata jalc_logo.png JALC-LOD-20220401.gz JaLC_LOD_format.pdf JaLC_LOD_sample.pdf
+
+## Import
+
+As of 2022-07-19, 6,502,202 release hits for `doi_registrar:jalc`.
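+
+That count comes from the release search index, and can be re-checked at any
+point with the same kind of query used elsewhere in these notes:
+
+    fatcat-cli search release doi_registrar:jalc --count
+    # 6502202 (as of 2022-07-19)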
+
+Re-download the file:
+
+    cd /srv/fatcat/datasets
+    wget 'https://archive.org/download/jalc-bulk-metadata-2022-04/JALC-LOD-20220401.gz'
+    gunzip JALC-LOD-20220401.gz
+    cd /srv/fatcat/src/python
+
+    wc -l /srv/fatcat/datasets/JALC-LOD-20220401
+    9525225
+
+Start with some samples:
+
+    export FATCAT_AUTH_WORKER_JALC=[...]
+    shuf -n100 /srv/fatcat/datasets/JALC-LOD-20220401 | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+    # Counter({'total': 100, 'exists': 89, 'insert': 11, 'skip': 0, 'update': 0})
+
+Full import (single threaded):
+
+    cat /srv/fatcat/datasets/JALC-LOD-20220401 | pv -l | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt
+    # 9.53M 22:26:06 [ 117 /s]
+    # Counter({'total': 9510096, 'exists': 8589731, 'insert': 915032, 'skip': 5333, 'inserted.container': 119, 'update': 0})
+
+Wow, almost a million new releases! 7,417,245 results for `doi_registrar:jalc`.
diff --git a/extra/bulk_edits/2022-07-12_orcid.md b/extra/bulk_edits/2022-07-12_orcid.md
new file mode 100644
index 00000000..760a16c8
--- /dev/null
+++ b/extra/bulk_edits/2022-07-12_orcid.md
@@ -0,0 +1,64 @@
+
+Annual ORCID import, using the 2021 public data file. Didn't do this last year,
+so this is a catch-up; will need to do another update later in 2022 (presumably
+in November/December).
+
+Not sure how many records this year. Current count on the orcid.org website is
+over 14 million ORCIDs, as of July 2022.
+
+Files downloaded from:
+
+- <https://info.orcid.org/orcids-2021-public-data-file-is-now-available>
+- <https://orcid.figshare.com/articles/dataset/ORCID_Public_Data_File_2021/16750535>
+- <https://archive.org/details/orcid-dump-2021>
+
+## Prep
+
+    ia upload orcid-dump-2021 -m collection:ia_biblio_metadata ORCID_2021_10_* orcid-logo.png
+
+    wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-3.0.7-full.jar
+
+    java -jar orcid-conversion-lib-3.0.7-full.jar --tarball -i ORCID_2021_10_summaries.tar.gz -v v3_0 -o ORCID_2021_10_summaries_json.tar.gz
+
+    tar xvf ORCID_2021_10_summaries_json.tar.gz
+
+    fd .json ORCID_2021_10_summaries/ | parallel cat {} | jq . -c | pv -l | gzip > ORCID_2021_10_summaries.json.gz
+    # 12.6M 27:59:25 [ 125 /s]
+
+    zcat ORCID_2021_10_summaries.json.gz | shuf -n10000 | gzip > ORCID_2021_10_summaries.sample_10k.json.gz
+
+    ia upload orcid-dump-2021 ORCID_2021_10_summaries.json.gz ORCID_2021_10_summaries.sample_10k.json.gz
+
+## Import
+
+Fetch to prod machine:
+
+    wget https://archive.org/download/orcid-dump-2021/ORCID_2021_10_summaries.json.gz
+    wget https://archive.org/download/orcid-dump-2021/ORCID_2021_10_summaries.sample_10k.json.gz
+
+Sample:
+
+    export FATCAT_AUTH_WORKER_ORCID=[...]
+    zcat /srv/fatcat/datasets/ORCID_2021_10_summaries.sample_10k.json.gz | ./fatcat_import.py orcid -
+    # in 2020:   Counter({'total': 10000, 'exists': 7356, 'insert': 2465, 'skip': 179, 'update': 0})
+    # this time: Counter({'total': 10000, 'exists': 7577, 'insert': 2191, 'skip': 232, 'update': 0})
+
+Bulk import:
+
+    export FATCAT_AUTH_WORKER_ORCID=[...]
+ time zcat /srv/fatcat/datasets/ORCID_2021_10_summaries.json.gz | pv -l | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid - + 12.6M 1:24:04 [2.51k/s] + Counter({'total': 1574111, 'exists': 1185437, 'insert': 347039, 'skip': 41635, 'update': 0}) + Counter({'total': 1583157, 'exists': 1193341, 'insert': 348187, 'skip': 41629, 'update': 0}) + Counter({'total': 1584441, 'exists': 1193385, 'insert': 349424, 'skip': 41632, 'update': 0}) + Counter({'total': 1575971, 'exists': 1187270, 'insert': 347190, 'skip': 41511, 'update': 0}) + Counter({'total': 1577323, 'exists': 1188892, 'insert': 346759, 'skip': 41672, 'update': 0}) + Counter({'total': 1586719, 'exists': 1195610, 'insert': 349115, 'skip': 41994, 'update': 0}) + Counter({'total': 1578484, 'exists': 1189423, 'insert': 347276, 'skip': 41785, 'update': 0}) + Counter({'total': 1578728, 'exists': 1190316, 'insert': 346445, 'skip': 41967, 'update': 0}) + + real 84m5.297s + user 436m26.428s + sys 41m36.959s + +Roughly 2.7 million new ORCIDs, great! diff --git a/extra/bulk_edits/2022-07-13_dblp.md b/extra/bulk_edits/2022-07-13_dblp.md new file mode 100644 index 00000000..25405132 --- /dev/null +++ b/extra/bulk_edits/2022-07-13_dblp.md @@ -0,0 +1,114 @@ + +## Prep + + 2022-07-13 05:24:33 (177 KB/s) - ‘dblp.xml.gz’ saved [715701831/715701831] + + Counter({'total': 9186263, 'skip': 9186263, 'has-doi': 4960506, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0}) + 5.71M 3:37:38 [ 437 /s] + + 7.48k 0:38:18 [3.25 /s] + + +## Container Import + +Run 2022-07-15, after a database backup/snapshot. + + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + # Got 5310 existing dblp container mappings. + # Counter({'total': 7471, 'exists': 7130, 'insert': 341, 'skip': 0, 'update': 0}) + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + 5310 existing_dblp_containers.tsv + 12782 all_dblp_containers.tsv + 7471 dblp_container_meta.json + 7476 prefix_list.txt + + +## Release Import + + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml + # Got 7480 dblp container mappings. 
+
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/gg/X90 ident=gfvkxubvsfdede7ps4af3oa34q
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/visalg/X88 ident=lvfyrd3lvva3hjuaaokzyoscmm
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/msr/PerumaANMO22 ident=2grlescl2bcpvd5yoc4npad3bm
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/dagstuhl/Brodlie97 ident=l6nh222fpjdzfotchu7vfjh6qu
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=series/gidiss/2018 ident=x6t7ze4z55enrlq2dnac4qqbve
+
+    Counter({'total': 9186263, 'exists': 5356574, 'has-doi': 4960506, 'skip': 3633039, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'exists-fuzzy': 192376, 'skip-dblp-container-missing': 156477, 'insert': 4216, 'skip-arxiv': 53, 'skip-dblp-id-mismatch': 5, 'skip-title': 1, 'update': 0})
+
+NOTE: had to re-try in the middle, so these counts are not accurate overall.
+
+Seems like a large number of `skip-dblp-container-missing`. Maybe should have
+re-generated that file differently?
+
+After this import there are 2,217,670 releases with a dblp ID, and 478,983 with
+a dblp ID and no DOI.
+
+
+## Sandcrawler Seedlist Generation
+
+Almost none of the ~487k dblp releases with no DOI have an associated file.
+This implies that no ingest has happened yet, even though the fatcat importer
+does parse and filter the "fulltext" URLs out of dblp records.
+
+    cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+    # 631k 0:02:39 [3.96k/s]
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | jq -r .base_url | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n25
+      43851 ceur-ws.org
+      33638 aclanthology.org
+      32077 aisel.aisnet.org
+      31017 ieeexplore.ieee.org
+      26426 dl.acm.org
+      23817 hdl.handle.net
+      22400 www.isca-speech.org
+      20072 tel.archives-ouvertes.fr
+      18609 www.aaai.org
+      18244 eprint.iacr.org
+      15720 ethos.bl.uk
+      14727 nbn-resolving.org
+      14470 proceedings.mlr.press
+      14095 dl.gi.de
+      12159 proceedings.neurips.cc
+      10890 knowledge.amia.org
+      10049 www.usenix.org
+       9675 papers.nips.cc
+       7541 subs.emis.de
+       7396 openaccess.thecvf.com
+       7345 mindmodeling.org
+       6574 ojs.aaai.org
+       5814 www.lrec-conf.org
+       5773 search.ndltd.org
+       5311 ijcai.org
+
+This is the first ingest, so let's do some sampling in the 'daily' queue:
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n100 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Looks like we can probably get away with doing these in the daily ingest queue,
+instead of bulk? Try a larger batch:
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n10000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Nope, these are going to need bulk ingest then follow-up crawling. Will run a
+heritrix crawl along with the JALC and DOAJ stuff.
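+
+Each line in the requests file is a single sandcrawler ingest request; an easy
+way to eyeball the shape (the fields are the ones emitted by the
+`dblp2ingestrequest.py` transform included in this commit):
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | head -n1 | jq .
+    # keys: base_url, ingest_type ("pdf"), link_source ("dblp"), link_source_id,
+    #       ingest_request_source, release_stage, ext_ids.dblp, edit_extra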
+
+    zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 631k 0:00:11 [54.0k/s]
+
+
+TODO:
+x python or jq transform of JSON objects
+x filter out german book/library URLs
+x ensure fatcat importer will actually import dblp matches
+x test with a small batch in daily or priority queue
+- enqueue all in bulk mode, even if processed before? many probably MAG or OAI-PMH previously
diff --git a/extra/bulk_edits/2022-07-19_doaj.md b/extra/bulk_edits/2022-07-19_doaj.md
new file mode 100644
index 00000000..d25f2dda
--- /dev/null
+++ b/extra/bulk_edits/2022-07-19_doaj.md
@@ -0,0 +1,78 @@
+
+Doing a batch import of DOAJ articles. Will need to do another one of these
+soon after setting up daily (OAI-PMH feed) ingest.
+
+## Prep
+
+    wget https://doaj.org/csv
+    wget https://doaj.org/public-data-dump/journal
+    wget https://doaj.org/public-data-dump/article
+
+    mv csv journalcsv__doaj_20220719_2135_utf8.csv
+    mv journal doaj_journal_data_2022-07-19.tar.gz
+    mv article doaj_article_data_2022-07-19.tar.gz
+
+    ia upload doaj_data_2022-07-19 -m collection:ia_biblio_metadata ../logo_cropped.jpg journalcsv__doaj_20220719_2135_utf8.csv doaj_journal_data_2022-07-19.tar.gz doaj_article_data_2022-07-19.tar.gz
+
+    tar xvf doaj_journal_data_2022-07-19.tar.gz
+    cat doaj_journal_data_*/journal_batch_*.json | jq .[] -c | pv -l | gzip > doaj_journal_data_2022-07-19_all.json.gz
+
+    tar xvf doaj_article_data_2022-07-19.tar.gz
+    cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2022-07-19_all.json.gz
+
+    ia upload doaj_data_2022-07-19 doaj_journal_data_2022-07-19_all.json.gz doaj_article_data_2022-07-19_all.json.gz
+
+On fatcat machine:
+
+    cd /srv/fatcat/datasets
+    wget https://archive.org/download/doaj_data_2022-07-19/doaj_article_data_2022-07-19_all.json.gz
+
+## Prod Article Import
+
+    git rev: 582495f66e5e08b6e257360097807711e53008d4
+    (includes DOAJ container-id required patch)
+
+    date: Tue Jul 19 22:46:42 UTC 2022
+
+    `doaj_id:*`: 1,335,195 hits
+
+Start with sample:
+
+    zcat /srv/fatcat/datasets/doaj_article_data_2022-07-19_all.json.gz | shuf -n1000 > /srv/fatcat/datasets/doaj_article_data_2022-07-19_sample.json
+
+    export FATCAT_AUTH_WORKER_DOAJ=[...]
+    cat /srv/fatcat/datasets/doaj_article_data_2022-07-19_sample.json | pv -l | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+    # Counter({'total': 1000, 'exists': 895, 'exists-fuzzy': 93, 'insert': 9, 'skip': 3, 'skip-no-container': 3, 'update': 0})
+
+Pretty few imports.
+
+Full ingest:
+
+    export FATCAT_AUTH_WORKER_DOAJ=[...]
+    zcat /srv/fatcat/datasets/doaj_article_data_2022-07-19_all.json.gz | pv -l | parallel -j6 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
+    # Counter({'total': 1282908, 'exists': 1145439, 'exists-fuzzy': 117120, 'insert': 16357, 'skip': 3831, 'skip-no-container': 2641, 'skip-title': 1190, 'skip-doaj-id-mismatch': 161, 'update': 0})
+
+That Counter is per-worker; times six, that's around 100k releases added.
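+
+(Each of the six workers prints its own Counter, so grand totals require
+summing the per-worker lines. A sketch, assuming the Counter lines were pasted
+into a hypothetical `counters.txt`:)
+
+    python3 - counters.txt <<'EOF'
+    import ast, collections, sys
+    total = collections.Counter()
+    for line in open(sys.argv[1]):
+        line = line.strip().lstrip('# ')
+        # each worker line looks like: Counter({'total': 1282908, ...})
+        if line.startswith('Counter('):
+            total += collections.Counter(ast.literal_eval(line[8:-1]))
+    print(total)
+    EOF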
+
+Got a bunch of:
+
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=fcdb7a7a9729403d8d99a21f6970dd1d ident=wesvmjwihvblzayfmrvvgr4ulm
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=1455dfe24583480883dbbb293a4bc0c6 ident=lfw57esesjbotms3grvvods5dq
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=88fa65a33c8e484091fc76f4cda59c25 ident=22abqt5qe5e7ngjd5fkyvzyc4q
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=eb7b03dc3dc340cea36891a68a50cce7 ident=ljedohlfyzdkxebgpcswjtd77q
+      warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=519617147ce248ea88d45ab098342153 ident=a63bqkttrbhyxavfr7li2w2xf4
+
+Should investigate!
+
+Also, noticed that the DOAJ importer is hitting `api.fatcat.wiki` (the public
+API endpoint), not the internal one. Guessing this is via fuzzycat.
+
+1,434,266 results for `doaj_id:*`.
+
+Then did a follow-up sandcrawler ingest, see notes in that repository. Note
+that newer ingest can crawl doaj.org, bypassing the sandcrawler SQL load, but
+the direct crawling is probably still faster.
diff --git a/extra/bulk_edits/2022-07-29_chocula.md b/extra/bulk_edits/2022-07-29_chocula.md
new file mode 100644
index 00000000..1f6f36ca
--- /dev/null
+++ b/extra/bulk_edits/2022-07-29_chocula.md
@@ -0,0 +1,47 @@
+
+Periodic import of chocula metadata updates.
+
+In particular, expecting a bunch of `publisher_type` updates.
+
+Going to explicitly skip DOAJ-only updates this time around. That is, if the
+container would have been updated anyway, the new DOAJ 'extra' metadata will
+pass through; but we won't update an entity for that reason alone. This is to
+reduce churn based only on the `as-of` key. Should probably change this
+behavior next time around.
+
+## Prod Import
+
+    date
+    # Sat Jul 30 01:18:41 UTC 2022
+
+    git log -n1
+    # 5ecf72cbb488a9a50eb869ea55b4c2bfc1440731
+
+    diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py
+    index 38802bcb..762c44dd 100644
+    --- a/python/fatcat_tools/importers/chocula.py
+    +++ b/python/fatcat_tools/importers/chocula.py
+    @@ -139,7 +139,7 @@ class ChoculaImporter(EntityImporter):
+             if ce.extra.get("publisher_type") and not existing.extra.get("publisher_type"):
+                 # many older containers were missing this metadata
+                 do_update = True
+    -        for k in ("kbart", "ia", "doaj"):
+    +        for k in ("kbart", "ia"):
+                 # always update these fields if not equal (chocula override)
+                 if ce.extra.get(k) and ce.extra[k] != existing.extra.get(k):
+                     do_update = True
+
+    export FATCAT_AUTH_WORKER_JOURNAL_METADATA=[...]
+    shuf -n100 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 100, 'exists': 98, 'exists-skip-update': 98, 'update': 2, 'skip': 0, 'insert': 0})
+
+    shuf -n1000 /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 1000, 'exists': 986, 'exists-skip-update': 986, 'update': 12, 'insert': 2, 'skip': 0})
+
+Huh, not seeing any `publisher_type` changes, which I was expecting more of.
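+
+A quick sanity check on how many export rows even carry a `publisher_type`, and
+with which values (a sketch; assumes the field sits at `extra.publisher_type`,
+matching the importer patch above):
+
+    cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json \
+        | jq -r '.extra.publisher_type // empty' \
+        | sort | uniq -c | sort -nr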
+
+    time cat /srv/fatcat/datasets/chocula_fatcat_export.2022-07-30.json | ./fatcat_import.py chocula --do-updates -
+    # Counter({'total': 188506, 'exists': 185808, 'exists-skip-update': 185806, 'update': 2495, 'insert': 203, 'exists-by-issnl': 2, 'skip': 0})
+
+Looking through the changelog, some did come through with `publisher_type`
+updates. Whew!
diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md
index 278dc1d8..716c95d6 100644
--- a/extra/bulk_edits/CHANGELOG.md
+++ b/extra/bulk_edits/CHANGELOG.md
@@ -9,6 +9,48 @@ this file should probably get merged into the guide at some point.
 
 This file should not turn into a TODO list!
 
+## 2022-07
+
+Ran a journal-level metadata update, using chocula.
+
+Cleaned up just under 500 releases with missing `container_id` from an older
+DOAJ article import.
+
+Imported roughly 100k releases from DOAJ, new since 2022-04.
+
+Imported roughly 2.7 million new ORCiD `creator` entities, using the 2021 dump
+(first update since the 2020 dump).
+
+Imported almost 1 million new DOI release entities from JALC, the first update
+in more than a year.
+
+Imported at least 400 new dblp containers, and an unknown number of new dblp
+release entities.
+
+Cleaned up about a thousand containers with incorrect `publisher_type`, based
+on the current publisher name. Further updates will populate after the next
+chocula import.
+
+Ran a second batch of journal-level metadata updates, from chocula, resulting
+in a couple thousand updated entities.
+
+
+## 2022-04
+
+Imported some initial fileset entities.
+
+Updated about 25k file entities from isiarticles.com, which are samples (spam
+for a translation service), to remove release linkage and set
+`content_scope=sample` (similar to the springer "page one" case).
+
+## 2022-03
+
+Ran a journal-level metadata update, using chocula.
+
+Ran a DOAJ article-level metadata import, yielding a couple hundred thousand
+new release entities. Crawling and bulk ingest of HTML and PDF fulltext for
+these articles also started.
+
 ## 2022-02
 
 - removed `container_id` linkage for some Datacite DOI releases which are
diff --git a/extra/cleanups/container_publisher_type.md b/extra/cleanups/container_publisher_type.md
new file mode 100644
index 00000000..dba800d3
--- /dev/null
+++ b/extra/cleanups/container_publisher_type.md
@@ -0,0 +1,100 @@
+
+A bunch of MDPI journals are incorrectly listed as 'longtail'.
+
+    fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --count
+    # 245
+
+Because this is 'extra' metadata, need a little python script to change the
+metadata (fatcat-cli doesn't have this feature yet):
+
+    import sys
+    import json
+
+    # target publisher_type, passed as the single CLI argument
+    publisher_type = sys.argv[1].strip().lower()
+    #print(publisher_type, file=sys.stderr)
+
+    # rewrite each entity on stdin with the new publisher_type
+    for line in sys.stdin:
+        if not line.strip():
+            continue
+        container = json.loads(line)
+        container["extra"]["publisher_type"] = publisher_type
+        print(json.dumps(container))
+
+Run some cleanups:
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+ export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 50 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" + # editgroup_oum6mnkl2rbn3jaua4a2gdlj5q + +Looks good, run the rest: + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Some more cleanup patterns: + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --count + # 84 + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --count + # 47 + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "commercial")' -c \ + | python3 ./container_publisher_type.py commercial \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 56 + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 98 + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 37 + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 558 + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch 
update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 28 + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Overall, around a thousand containers updated. Changes to releases will not be +reflected until they are re-indexed. diff --git a/extra/cleanups/file_isiarticles.md b/extra/cleanups/file_isiarticles.md new file mode 100644 index 00000000..3858361c --- /dev/null +++ b/extra/cleanups/file_isiarticles.md @@ -0,0 +1,20 @@ + +The domain isiarticles.com hosts a bunch of partial spam PDFs. + +As a first pass, we can remove these via the domain itself. + +A "blocklist" for this domain has been added to sandcrawler, so they should not +get auto-ingested in the future. + + # 2022-04-20 + fatcat-cli search file domain:isiarticles.com --count + 25067 + +## Prod Cleanup + +See bulk edits log. + +Verify cleanup: + + fatcat-cli search file domain:isiarticles.com '!content_scope:*' --count + 0 diff --git a/extra/container_count_update/update_prod.sh b/extra/container_count_update/update_prod.sh new file mode 100755 index 00000000..766398f2 --- /dev/null +++ b/extra/container_count_update/update_prod.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -euo pipefail + +export CONTAINER_INDEX=fatcat_container_v05_20220110 + +fatcat-cli search container --index-json --limit 0 state:active \ + | jq .ident -r \ + | pv -l \ + > container_idents.tsv + +cat container_idents.tsv \ + | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/{}/stats.json' \ + | jq -c . \ + | pv -l \ + > container_stats.json + +cat container_stats.json \ + | jq '{ ident: .ident, releases_total: .total, preservation_bright: .preservation.bright, preservation_dark: .preservation.dark, preservation_shadows_only: .preservation.shadows_only, preservation_none: .preservation.none }' -c \ + | esbulk -verbose -index $CONTAINER_INDEX -optype update -id ident diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore index a04dd76e..60774a12 100644 --- a/extra/dblp/.gitignore +++ b/extra/dblp/.gitignore @@ -4,3 +4,9 @@ series/ Pipfile.lock *.json *.html +*.txt +*.dtd +*.xml +*.xml.gz +*.tsv +*.json.gz diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile index dbf86ac0..69705a3a 100644 --- a/extra/dblp/Pipfile +++ b/extra/dblp/Pipfile @@ -5,6 +5,7 @@ name = "pypi" [packages] selectolax = "*" +urlcanon = "*" [dev-packages] diff --git a/extra/dblp/README.md b/extra/dblp/README.md index e6ccce4f..a95f7214 100644 --- a/extra/dblp/README.md +++ b/extra/dblp/README.md @@ -1,14 +1,51 @@ -This file describes hacks used to import dblp container metadata. +This file describes hacks used to import dblp container and release metadata. -As of December 2020 this is part of the dblp release metadata import pipeline: -we must have conference and other non-ISSN containers created before running -the release import. dblp does not publish container-level metadata in a -structured format (eg, in their dumps), so scraping the HTML is unfortunately -necessary. +The container metadata must be processed and imported first, to create +containers for non-ISSN venues. 
However, dblp only publishes structured +metadata for articles (releases), not venues (containers), so we need to +process the articles, then import the containers, then import the articles. +There is a path that scrapes venue metadata out of dblp.org HTML. -## Quick Bootstrap Commands + +## New Process (2022) + +Usually all of this gets run on a production fatcat instance. It may be +possible to run parts elsewhere, but not confirmed, and would require copying +some set of files around. + + # remove any old/stale files + ./cleanup.sh + + ./prep_container_metadata.sh + +This will take a while to run, after which the container metadata can be +imported, like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + +Check that counts look sane: + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + +Then do release import like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml + +Lastly, to generate sandcrawler ingest requests, from the JSON-dumped partial +release objects:: + + cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz + + +## [OLD] Manual Commands Set up a working directory somewhere: diff --git a/extra/dblp/cleanup.sh b/extra/dblp/cleanup.sh new file mode 100755 index 00000000..52e1a2ea --- /dev/null +++ b/extra/dblp/cleanup.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +# run this as 'fatcat' user on a production machine + +rm -f dblp.dtd +rm -f dblp.xml.gz +rm -f dblp.xml +rm -f dblp_releases_partial.json +rm -f prefix_list.txt +rm -f dblp_container_meta.json +rm -f existing_dblp_containers.tsv +rm -f all_dblp_containers.tsv + +rm -rf ./journals/ +rm -rf ./conf/ +rm -rf ./series/ + diff --git a/extra/dblp/dblp2ingestrequest.py b/extra/dblp/dblp2ingestrequest.py new file mode 100755 index 00000000..bdf5575d --- /dev/null +++ b/extra/dblp/dblp2ingestrequest.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Transform a transformed, fatcat-like dblp object (JSON) into zero or more +sandcrawler ingest requests. +""" + +import argparse +import json +import sys + +import urlcanon + +DOMAIN_BLOCKLIST = [ + # we crawl some of these directly via extid; others are just catalogs + "://arxiv.org/", + "://europepmc.org/", + #"://hdl.handle.net/", + "ncbi.nlm.nih.gov/", + "://doi.org/", + "zenodo.org/", + "figshare.com/", + "://d-nb.info/", + "://www.base-search.net/", +] + + +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) + + +def transform(obj): + """ + Transforms from a single object to zero or more ingest requests. + Returns a list of dicts. 
+ """ + + requests = [] + if not obj["ext_ids"].get("dblp"): + return requests + if not obj.get("_dblp_ee_urls"): + return requests + + for url in obj["_dblp_ee_urls"]: + skip = False + for domain in DOMAIN_BLOCKLIST: + if domain in url: + skip = True + if skip: + continue + try: + base_url = canon(url) + except UnicodeEncodeError: + continue + + request = { + "base_url": base_url, + "ingest_type": "pdf", + "link_source": "dblp", + "link_source_id": obj["ext_ids"]["dblp"], + "ingest_request_source": "dblp", + "release_stage": obj.get("release_stage") or None, + "ext_ids": { + "dblp": obj["ext_ids"]["dblp"], + }, + "edit_extra": {}, + } + requests.append(request) + + return requests + + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + + requests = transform(row) or [] + for r in requests: + print("{}".format(json.dumps(r, sort_keys=True))) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="dblp transformed JSON file to use", type=argparse.FileType("r") + ) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + + +if __name__ == "__main__": + main() diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh new file mode 100755 index 00000000..21a50ab0 --- /dev/null +++ b/extra/dblp/prep_metadata.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# run this as 'fatcat' user on a production machine +#export FATCAT_API_HOST="https://api.fatcat.wiki/v0" + +set -e -u -o pipefail + +# ensure deps +#alias fd=fdfind +fd -h > /dev/null +fatcat-cli -h > /dev/null +pipenv -h > /dev/null + +# ensure pipenv is ready +pipenv install +pipenv run true + + +wget -c 'https://dblp.org/xml/dblp.dtd' +wget -c 'https://dblp.org/xml/dblp.xml.gz' + +zcat dblp.xml.gz > dblp.xml + +cd ../../python +pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json + +cd ../extra/dblp/ + +cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt + +mkdir -p journals +mkdir -p conf +mkdir -p series + +shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +# clean up any failed/empty files, then re-run the above parallel/wget command +find . -empty -type f -delete + +shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +find . 
-empty -type f -delete + +fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json + +fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv + +cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz diff --git a/extra/sql_dumps/Makefile b/extra/sql_dumps/Makefile new file mode 100644 index 00000000..01607d34 --- /dev/null +++ b/extra/sql_dumps/Makefile @@ -0,0 +1,93 @@ + +SHELL=/bin/bash -euo pipefail +TODAY ?= $(shell date --iso --utc) +DATADIR ?= /srv/fatcat/snapshots/$(TODAY) +DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) +DATABASE_URL ?= fatcat_prod + +.PHONY: help +help: ## Print info about all commands + @echo "Commands:" + @echo + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: create_datadir +create_datadir: + mkdir -p $(DATADIR)/ + sudo chmod a+rw $(DATADIR)/ + +$(DATADIR)/.IDENTS: + sudo -u postgres DATABASE_URL=$(DATABASE_URL) ./ident_table_snapshot.sh $(DATADIR) + sudo -u postgres mv /tmp/fatcat_ident_*.tsv $(DATADIR) + touch $@ + +$(DATADIR)/release_export_expanded.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_releases_by_work.tsv | sudo -u fatcat ./target/release/fatcat-export releasebywork --expand files,filesets,webcaptures,container -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/creator_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_creators.tsv | sudo -u fatcat ./target/release/fatcat-export creator -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/container_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_containers.tsv | sudo -u fatcat ./target/release/fatcat-export container -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/file_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_files.tsv | sudo -u fatcat ./target/release/fatcat-export file -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/fileset_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_filesets.tsv | sudo -u fatcat ./target/release/fatcat-export fileset -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/webcapture_export.json.gz: $(DATADIR)/.IDENTS + cd ../../rust; cat $(DATADIR)/fatcat_ident_webcaptures.tsv | sudo -u fatcat ./target/release/fatcat-export webcapture -j8 | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/abstracts.json.gz: + sudo -u postgres psql $(DATABASE_URL) < dump_abstracts.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/file_hashes.tsv.gz: + sudo -u postgres psql $(DATABASE_URL) < dump_file_hashes.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/release_extid.tsv.gz: + sudo -u postgres psql $(DATABASE_URL) < dump_release_extid.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > $@.wip + mv $@.wip $@ + +$(DATADIR)/.METADATA_EXPORT: $(DATADIR)/.IDENTS $(DATADIR)/release_export_expanded.json.gz $(DATADIR)/creator_export.json.gz $(DATADIR)/container_export.json.gz $(DATADIR)/file_export.json.gz $(DATADIR)/fileset_export.json.gz $(DATADIR)/webcapture_export.json.gz $(DATADIR)/abstracts.json.gz $(DATADIR)/file_hashes.tsv.gz $(DATADIR)/release_extid.tsv.gz ## Dump bulk metadata to disk + touch $@ + +.PHONY: metadata-exports +metadata-exports: 
create_datadir $(DATADIR)/.METADATA_EXPORT ## Dump bulk metadata to disk + @echo + + +$(DATADIR)/.METADATA_UPLOADED: $(DATADIR)/.METADATA_EXPORT + ia upload --checksum fatcat_bulk_exports_$(TODAY) ia_exports_item_readme.md --remote-name=README.md -m collection:fatcat_snapshots_and_exports -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Fatcat Bulk Metadata Exports ($(TODAY))" + ia upload fatcat_bulk_exports_$(TODAY) $(DATADIR)/*_export.json.gz $(DATADIR)/*_export_expanded.json.gz $(DATADIR)/abstracts.json.gz $(DATADIR)/file_hashes.tsv.gz $(DATADIR)/release_extid.tsv.gz + touch $@ + +.PHONY: upload-metadata-exports +upload-metadata-exports: create_datadir $(DATADIR)/.METADATA_UPLOADED ## Upload bulk metadata exports to archive.org + @echo + +$(DATADIR)/.PUBLIC_DB_DUMP: + sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=auth_oidc fatcat_prod > $(DATADIR)/fatcat_public_dbdump_${DATESLUG}.pgdump.wip + mv $(DATADIR)/fatcat_public_dbdump_${DATESLUG}.pgdump.wip $(DATADIR)/fatcat_public_dbdump_${DATESLUG}.pgdump + touch $@ + +.PHONY: public-database-snapshot +public-database-snapshot: create_datadir $(DATADIR)/.PUBLIC_DB_DUMP ## Create SQL database snapshot which can be shared publicly + @echo + +$(DATADIR)/.PUBLIC_DB_UPLOADED: $(DATADIR)/.PUBLIC_DB_DUMP + ia upload --checksum fatcat_sqldump_public_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:fatcat_snapshots_and_exports -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Fatcat Public Database Snapshot ($(TODAY))" + ia upload --checksum fatcat_sqldump_public_$(TODAY) $(DATADIR)/fatcat_public_dbdump_*.pgdump + touch $@ + +.PHONY: upload-public-database-snapshot +upload-public-database-snapshot: create_datadir public-database-snapshot $(DATADIR)/.PUBLIC_DB_UPLOADED ## Upload metadata snapshot to archive.org + @echo diff --git a/extra/stats/2022-03-21-prod-stats.json b/extra/stats/2022-03-21-prod-stats.json new file mode 100644 index 00000000..4a82860f --- /dev/null +++ b/extra/stats/2022-03-21-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5850551,"timestamp":"2022-03-21T22:32:20.050613+00:00"}},"container":{"total":190480},"papers":{"in_kbart":77243855,"in_web":33064706,"in_web_not_kbart":15982780,"is_oa":24345482,"total":126701207},"release":{"refs_total":1269080199,"total":180472435}} diff --git a/extra/stats/2022-03-21-prod-table-sizes.txt b/extra/stats/2022-03-21-prod-table-sizes.txt new file mode 100644 index 00000000..328deec0 --- /dev/null +++ b/extra/stats/2022-03-21-prod-table-sizes.txt @@ -0,0 +1,47 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 707.94G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 86 GB | 31 GB | 117 GB + "public"."refs_blob" | 114 GB | 2185 MB | 116 GB + "public"."release_rev" | 82 GB | 25 GB | 107 GB + "public"."file_rev" | 35 GB | 29 GB | 64 GB + "public"."release_edit" | 18 GB | 20 GB | 38 GB + "public"."file_rev_url" | 30 GB | 7856 MB | 37 GB + "public"."work_edit" | 17 GB | 19 GB | 36 GB + "public"."abstracts" | 33 GB | 2829 MB | 36 GB + "public"."file_edit" | 17 GB | 15 GB | 33 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 11 GB | 11 GB | 23 GB + "public"."file_rev_release" | 8709 MB | 10 GB | 19 GB + "public"."file_ident" | 7478 MB | 7579 MB | 15 GB + "public"."work_rev" | 7552 MB | 5238 MB | 12 GB + 
"public"."release_ref" | 6486 MB | 5199 MB | 11 GB + "public"."release_rev_abstract" | 4718 MB | 5174 MB | 9892 MB + "public"."webcapture_rev_cdx" | 3491 MB | 338 MB | 3829 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1224 MB | 252 MB | 1476 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 515 MB | 641 MB | 1157 MB + "public"."changelog" | 351 MB | 297 MB | 648 MB + "public"."container_rev" | 228 MB | 45 MB | 273 MB + "public"."webcapture_edit" | 66 MB | 47 MB | 113 MB + "public"."container_edit" | 59 MB | 54 MB | 113 MB + "public"."webcapture_rev_url" | 54 MB | 20 MB | 74 MB + "public"."webcapture_rev_release" | 20 MB | 35 MB | 54 MB + "public"."webcapture_rev" | 38 MB | 14 MB | 51 MB + "public"."webcapture_ident" | 22 MB | 27 MB | 49 MB + "public"."container_ident" | 13 MB | 20 MB | 33 MB + "public"."editor" | 88 kB | 128 kB | 216 kB + "public"."auth_oidc" | 88 kB | 120 kB | 208 kB + "public"."editgroup_annotation" | 72 kB | 48 kB | 120 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2022-04-20-prod-stats.json b/extra/stats/2022-04-20-prod-stats.json new file mode 100644 index 00000000..90c673d5 --- /dev/null +++ b/extra/stats/2022-04-20-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5894575,"timestamp":"2022-04-20T04:19:07.676356+00:00"}},"container":{"total":191000},"papers":{"in_kbart":77487575,"in_web":33633575,"in_web_not_kbart":16531221,"is_oa":24576975,"total":127353365},"release":{"refs_total":1288004961,"total":181877321}} diff --git a/extra/stats/2022-05-15-prod-stats.json b/extra/stats/2022-05-15-prod-stats.json new file mode 100644 index 00000000..37c83f69 --- /dev/null +++ b/extra/stats/2022-05-15-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5946444,"timestamp":"2022-05-15T21:40:19.956944+00:00"}},"container":{"total":191315},"papers":{"in_kbart":77657266,"in_web":34881556,"in_web_not_kbart":17596263,"is_oa":24739223,"total":127770175},"release":{"refs_total":1303116536,"total":182728162}} diff --git a/extra/stats/2022-07-06-prod-stats.json b/extra/stats/2022-07-06-prod-stats.json new file mode 100644 index 00000000..c93b4e0c --- /dev/null +++ b/extra/stats/2022-07-06-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6011966,"timestamp":"2022-07-06T23:45:10.513758+00:00"}},"container":{"total":193199},"papers":{"in_kbart":78038966,"in_web":35774136,"in_web_not_kbart":18362815,"is_oa":25221089,"total":128843120},"release":{"refs_total":1335962190,"total":184701189}} diff --git a/extra/stats/2022-07-06-prod-table-sizes.txt b/extra/stats/2022-07-06-prod-table-sizes.txt new file mode 100644 index 00000000..01d205b1 --- /dev/null +++ b/extra/stats/2022-07-06-prod-table-sizes.txt @@ -0,0 +1,48 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 732.62G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 88 GB | 32 GB | 120 GB + "public"."refs_blob" | 118 GB | 2196 MB | 120 GB + "public"."release_rev" | 85 GB 
| 25 GB | 110 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 18 GB | 21 GB | 39 GB + "public"."file_rev_url" | 31 GB | 7999 MB | 39 GB + "public"."abstracts" | 35 GB | 3655 MB | 38 GB + "public"."work_edit" | 17 GB | 19 GB | 37 GB + "public"."file_edit" | 18 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + "public"."file_rev_release" | 8910 MB | 10 GB | 19 GB + "public"."file_ident" | 7704 MB | 7605 MB | 15 GB + "public"."work_rev" | 7741 MB | 5238 MB | 13 GB + "public"."release_ref" | 6714 MB | 5646 MB | 12 GB + "public"."release_rev_abstract" | 5015 MB | 7213 MB | 12 GB + "public"."webcapture_rev_cdx" | 4340 MB | 419 MB | 4758 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1282 MB | 256 MB | 1537 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 522 MB | 648 MB | 1170 MB + "public"."changelog" | 378 MB | 301 MB | 679 MB + "public"."container_rev" | 249 MB | 60 MB | 308 MB + "public"."webcapture_edit" | 82 MB | 53 MB | 135 MB + "public"."container_edit" | 63 MB | 69 MB | 132 MB + "public"."webcapture_rev_url" | 65 MB | 22 MB | 87 MB + "public"."webcapture_rev_release" | 24 MB | 35 MB | 59 MB + "public"."webcapture_rev" | 45 MB | 14 MB | 59 MB + "public"."webcapture_ident" | 27 MB | 27 MB | 54 MB + "public"."container_ident" | 13 MB | 20 MB | 34 MB + "public"."auth_oidc" | 104 kB | 160 kB | 264 kB + "public"."editor" | 96 kB | 160 kB | 256 kB + "public"."editgroup_annotation" | 80 kB | 48 kB | 128 kB + "public"."fileset_rev_file" | 88 kB | 32 kB | 120 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) + diff --git a/extra/stats/2022-07-14-prod-stats.json b/extra/stats/2022-07-14-prod-stats.json new file mode 100644 index 00000000..62d06606 --- /dev/null +++ b/extra/stats/2022-07-14-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6036957,"timestamp":"2022-07-14T18:53:18.228827+00:00"}},"container":{"total":193300},"papers":{"in_kbart":78102604,"in_web":36247601,"in_web_not_kbart":18551021,"is_oa":25281045,"total":128995907},"release":{"refs_total":1340195856,"total":184966214}} diff --git a/extra/stats/2022-07-14-prod-table-sizes.txt b/extra/stats/2022-07-14-prod-table-sizes.txt new file mode 100644 index 00000000..b4fae69a --- /dev/null +++ b/extra/stats/2022-07-14-prod-table-sizes.txt @@ -0,0 +1,47 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 735.11G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 88 GB | 32 GB | 121 GB + "public"."refs_blob" | 119 GB | 2200 MB | 121 GB + "public"."release_rev" | 85 GB | 25 GB | 110 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 18 GB | 21 GB | 39 GB + "public"."file_rev_url" | 31 GB | 8106 MB | 39 GB + "public"."abstracts" | 35 GB | 3671 MB | 39 GB + "public"."work_edit" | 17 GB | 20 GB | 37 GB + "public"."file_edit" | 18 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + 
"public"."file_rev_release" | 8975 MB | 10 GB | 19 GB + "public"."file_ident" | 7775 MB | 7615 MB | 15 GB + "public"."work_rev" | 7753 MB | 5238 MB | 13 GB + "public"."release_ref" | 6721 MB | 5662 MB | 12 GB + "public"."release_rev_abstract" | 5035 MB | 7250 MB | 12 GB + "public"."webcapture_rev_cdx" | 4341 MB | 419 MB | 4760 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1294 MB | 256 MB | 1550 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 524 MB | 649 MB | 1173 MB + "public"."changelog" | 383 MB | 301 MB | 685 MB + "public"."container_rev" | 249 MB | 60 MB | 308 MB + "public"."webcapture_edit" | 82 MB | 53 MB | 135 MB + "public"."container_edit" | 63 MB | 69 MB | 132 MB + "public"."webcapture_rev_url" | 65 MB | 22 MB | 87 MB + "public"."webcapture_rev_release" | 24 MB | 35 MB | 59 MB + "public"."webcapture_rev" | 45 MB | 14 MB | 59 MB + "public"."webcapture_ident" | 27 MB | 27 MB | 54 MB + "public"."container_ident" | 13 MB | 20 MB | 34 MB + "public"."auth_oidc" | 104 kB | 160 kB | 264 kB + "public"."editor" | 96 kB | 160 kB | 256 kB + "public"."editgroup_annotation" | 80 kB | 48 kB | 128 kB + "public"."fileset_rev_file" | 88 kB | 32 kB | 120 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2022-07-29-prod-stats.json b/extra/stats/2022-07-29-prod-stats.json new file mode 100644 index 00000000..41d234ea --- /dev/null +++ b/extra/stats/2022-07-29-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6143354,"timestamp":"2022-07-30T01:39:22.900415+00:00"}},"container":{"total":194187},"papers":{"in_kbart":78331069,"in_web":36847123,"in_web_not_kbart":18991886,"is_oa":25579026,"total":130376642},"release":{"refs_total":1350391488,"total":186556315}} diff --git a/extra/stats/2022-07-29-prod-table-sizes.txt b/extra/stats/2022-07-29-prod-table-sizes.txt new file mode 100644 index 00000000..cb85078f --- /dev/null +++ b/extra/stats/2022-07-29-prod-table-sizes.txt @@ -0,0 +1,48 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 748.75G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 89 GB | 33 GB | 122 GB + "public"."refs_blob" | 119 GB | 2218 MB | 121 GB + "public"."release_rev" | 86 GB | 25 GB | 111 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 19 GB | 21 GB | 40 GB + "public"."file_rev_url" | 31 GB | 8306 MB | 40 GB + "public"."abstracts" | 35 GB | 3700 MB | 39 GB + "public"."work_edit" | 17 GB | 20 GB | 37 GB + "public"."file_edit" | 19 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 24 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + "public"."file_rev_release" | 9031 MB | 10 GB | 19 GB + "public"."file_ident" | 7837 MB | 7624 MB | 15 GB + "public"."work_rev" | 7823 MB | 5243 MB | 13 GB + "public"."release_ref" | 6882 MB | 6039 MB | 13 GB + "public"."release_rev_abstract" | 5100 MB | 7327 MB | 12 GB + "public"."webcapture_rev_cdx" | 8090 MB | 760 MB | 8849 MB + "public"."creator_edit" | 1203 MB | 1919 MB | 3122 MB + "public"."creator_rev" | 1198 MB | 
diff --git a/extra/stats/2022-07-29-prod-table-sizes.txt b/extra/stats/2022-07-29-prod-table-sizes.txt
new file mode 100644
index 00000000..cb85078f
--- /dev/null
+++ b/extra/stats/2022-07-29-prod-table-sizes.txt
@@ -0,0 +1,48 @@
+PostgreSQL 13.5 - wbgrp-svc502.us.archive.org
+Size: 748.75G
+
+              table_name               | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib"            | 89 GB      | 33 GB        | 122 GB
+ "public"."refs_blob"                  | 119 GB     | 2218 MB      | 121 GB
+ "public"."release_rev"                | 86 GB      | 25 GB        | 111 GB
+ "public"."file_rev"                   | 36 GB      | 29 GB        | 65 GB
+ "public"."release_edit"               | 19 GB      | 21 GB        | 40 GB
+ "public"."file_rev_url"               | 31 GB      | 8306 MB      | 40 GB
+ "public"."abstracts"                  | 35 GB      | 3700 MB      | 39 GB
+ "public"."work_edit"                  | 17 GB      | 20 GB        | 37 GB
+ "public"."file_edit"                  | 19 GB      | 16 GB        | 34 GB
+ "public"."release_ident"              | 12 GB      | 12 GB        | 24 GB
+ "public"."work_ident"                 | 12 GB      | 11 GB        | 23 GB
+ "public"."file_rev_release"           | 9031 MB    | 10 GB        | 19 GB
+ "public"."file_ident"                 | 7837 MB    | 7624 MB      | 15 GB
+ "public"."work_rev"                   | 7823 MB    | 5243 MB      | 13 GB
+ "public"."release_ref"                | 6882 MB    | 6039 MB      | 13 GB
+ "public"."release_rev_abstract"       | 5100 MB    | 7327 MB      | 12 GB
+ "public"."webcapture_rev_cdx"         | 8090 MB    | 760 MB       | 8849 MB
+ "public"."creator_edit"               | 1203 MB    | 1919 MB      | 3122 MB
+ "public"."creator_rev"                | 1198 MB    | 1427 MB      | 2624 MB
+ "public"."creator_ident"              | 812 MB     | 1258 MB      | 2070 MB
+ "public"."editgroup"                  | 1325 MB    | 261 MB       | 1587 MB
+ "public"."release_rev_extid"          | 537 MB     | 668 MB       | 1204 MB
+ "public"."changelog"                  | 395 MB     | 307 MB       | 702 MB
+ "public"."container_rev"              | 251 MB     | 61 MB        | 312 MB
+ "public"."webcapture_edit"            | 144 MB     | 99 MB        | 242 MB
+ "public"."webcapture_rev_url"         | 113 MB     | 42 MB        | 155 MB
+ "public"."container_edit"             | 63 MB      | 71 MB        | 135 MB
+ "public"."webcapture_rev_release"     | 40 MB      | 70 MB        | 110 MB
+ "public"."webcapture_rev"             | 77 MB      | 27 MB        | 104 MB
+ "public"."webcapture_ident"           | 45 MB      | 54 MB        | 100 MB
+ "public"."container_ident"            | 13 MB      | 21 MB        | 34 MB
+ "public"."auth_oidc"                  | 104 kB     | 160 kB       | 264 kB
+ "public"."editor"                     | 96 kB      | 160 kB       | 256 kB
+ "public"."editgroup_annotation"       | 88 kB      | 48 kB        | 136 kB
+ "public"."fileset_rev_file"           | 88 kB      | 32 kB        | 120 kB
+ "public"."fileset_edit"               | 16 kB      | 48 kB        | 64 kB
+ "public"."fileset_rev_url"            | 16 kB      | 32 kB        | 48 kB
+ "public"."fileset_rev_release"        | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_ident"              | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_rev"                | 16 kB      | 16 kB        | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB        | 24 kB
+(41 rows)
+
diff --git a/extra/stats/2022-09-06-prod-stats.json b/extra/stats/2022-09-06-prod-stats.json
new file mode 100644
index 00000000..e7755f9f
--- /dev/null
+++ b/extra/stats/2022-09-06-prod-stats.json
@@ -0,0 +1,146 @@
+[HTML capture of the fatcat.wiki /stats page (site v0.5.1-109-g5ecf72cb), committed in
+place of the usual single-line JSON export; setting aside the page chrome, the numbers
+it reports are: changelog index 6205643 (2022-09-06); "papers": total 131,082,821,
+fulltext on web 37,792,347, "gold" OA 25,854,622, in a Keepers/KBART archive
+78,632,782, on web but not in Keepers 19,770,890; releases: total 188,266,679;
+containers: total 194,685]
diff --git a/extra/stats/2022-09-06-table-sizes.txt b/extra/stats/2022-09-06-table-sizes.txt
new file mode 100644
index 00000000..ddbd6842
--- /dev/null
+++ b/extra/stats/2022-09-06-table-sizes.txt
@@ -0,0 +1,48 @@
+
+PostgreSQL 13.5 - wbgrp-svc502.us.archive.org
+Size: 760.02G
+
+              table_name               | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib"            | 90 GB      | 34 GB        | 124 GB
+ "public"."refs_blob"                  | 121 GB     | 2295 MB      | 123 GB
+ "public"."release_rev"                | 87 GB      | 26 GB        | 112 GB
+ "public"."file_rev"                   | 36 GB      | 30 GB        | 66 GB
+ "public"."file_rev_url"               | 32 GB      | 8778 MB      | 40 GB
+ "public"."release_edit"               | 19 GB      | 21 GB        | 40 GB
+ "public"."abstracts"                  | 36 GB      | 3726 MB      | 40 GB
+ "public"."work_edit"                  | 18 GB      | 20 GB        | 38 GB
+ "public"."file_edit"                  | 19 GB      | 16 GB        | 35 GB
+ "public"."release_ident"              | 12 GB      | 12 GB        | 24 GB
+ "public"."work_ident"                 | 12 GB      | 12 GB        | 23 GB
+ "public"."file_rev_release"           | 9100 MB    | 10 GB        | 19 GB
+ "public"."file_ident"                 | 7914 MB    | 7647 MB      | 15 GB
+ "public"."release_ref"                | 7012 MB    | 6486 MB      | 13 GB
+ "public"."work_rev"                   | 7900 MB    | 5280 MB      | 13 GB
+ "public"."release_rev_abstract"       | 5217 MB    | 7395 MB      | 12 GB
+ "public"."webcapture_rev_cdx"         | 9173 MB    | 862 MB       | 10035 MB
+ "public"."creator_edit"               | 1203 MB    | 1919 MB      | 3122 MB
+ "public"."creator_rev"                | 1198 MB    | 1427 MB      | 2624 MB
+ "public"."creator_ident"              | 812 MB     | 1258 MB      | 2070 MB
+ "public"."editgroup"                  | 1347 MB    | 272 MB       | 1620 MB
+ "public"."release_rev_extid"          | 540 MB     | 672 MB       | 1212 MB
+ "public"."changelog"                  | 406 MB     | 318 MB       | 724 MB
+ "public"."container_rev"              | 251 MB     | 62 MB        | 313 MB
+ "public"."webcapture_edit"            | 168 MB     | 105 MB       | 273 MB
+ "public"."webcapture_rev_url"         | 133 MB     | 44 MB        | 177 MB
+ "public"."container_edit"             | 63 MB      | 72 MB        | 135 MB
+ "public"."webcapture_rev_release"     | 48 MB      | 71 MB        | 119 MB
+ "public"."webcapture_rev"             | 91 MB      | 27 MB        | 118 MB
+ "public"."webcapture_ident"           | 54 MB      | 55 MB        | 108 MB
+ "public"."container_ident"            | 13 MB      | 21 MB        | 34 MB
+ "public"."editor"                     | 104 kB     | 168 kB       | 272 kB
+ "public"."auth_oidc"                  | 104 kB     | 160 kB       | 264 kB
+ "public"."editgroup_annotation"       | 88 kB      | 48 kB        | 136 kB
+ "public"."fileset_rev_file"           | 88 kB      | 32 kB        | 120 kB
+ "public"."fileset_edit"               | 16 kB      | 48 kB        | 64 kB
+ "public"."fileset_rev_url"            | 16 kB      | 32 kB        | 48 kB
+ "public"."fileset_rev_release"        | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_ident"              | 8192 bytes | 32 kB        | 40 kB
+ "public"."fileset_rev"                | 16 kB      | 16 kB        | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB        | 24 kB
+(41 rows)
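Successive snapshots make growth easy to eyeball: between 2022-07-14 and 2022-07-29 the release count grew by about 1.6 million. A quick check of that kind, as a sketch (run from `extra/stats/`, with both snapshot files present):

    # growth in release and "paper" counts between two stats snapshots
    jq -s '{
      new_releases: (.[1].release.total - .[0].release.total),
      new_papers:   (.[1].papers.total  - .[0].papers.total)
    }' 2022-07-14-prod-stats.json 2022-07-29-prod-stats.json
    # {"new_releases": 1590101, "new_papers": 1380735}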