diff options
Diffstat (limited to 'extra')
-rw-r--r-- | extra/bulk_edits/2022-07-12_jalc.md | 47 | ||||
-rw-r--r-- | extra/bulk_edits/2022-07-12_orcid.md | 64 | ||||
-rw-r--r-- | extra/bulk_edits/2022-07-19_doaj.md | 78 | ||||
-rw-r--r-- | extra/bulk_edits/CHANGELOG.md | 15 | ||||
-rw-r--r-- | extra/cleanups/container_publisher_type.md | 100 | ||||
-rw-r--r-- | extra/stats/2022-07-14-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2022-07-14-prod-table-sizes.txt | 47 |
7 files changed, 352 insertions, 0 deletions
diff --git a/extra/bulk_edits/2022-07-12_jalc.md b/extra/bulk_edits/2022-07-12_jalc.md new file mode 100644 index 00000000..d9f09fee --- /dev/null +++ b/extra/bulk_edits/2022-07-12_jalc.md @@ -0,0 +1,47 @@ + +Import of a 2022-04 JALC DOI metadata snapshot. + +Note that we had downloaded a prior 2021-04 snapshot, but don't seem to have +ever imported it. + +## Download and Archive + +URL for bulk snapshot is available at the bottom of this page: <https://form.jst.go.jp/enquetes/jalcmetadatadl_1703> + +More info: <http://japanlinkcenter.org/top/service/service_data.html> + + wget 'https://japanlinkcenter.org/lod/JALC-LOD-20220401.gz?jalcmetadatadl_1703' + wget 'http://japanlinkcenter.org/top/doc/JaLC_LOD_format.pdf' + wget 'http://japanlinkcenter.org/top/doc/JaLC_LOD_sample.pdf' + + mv 'JALC-LOD-20220401.gz?jalcmetadatadl_1703' JALC-LOD-20220401.gz + + ia upload jalc-bulk-metadata-2022-04 -m collection:ia_biblio_metadata jalc_logo.png JALC-LOD-20220401.gz JaLC_LOD_format.pdf JaLC_LOD_sample.pdf + +## Import + +As of 2022-07-19, 6,502,202 release hits for `doi_registrar:jalc`. + +Re-download the file: + + cd /srv/fatcat/datasets + wget 'https://archive.org/download/jalc-bulk-metadata-2022-04/JALC-LOD-20220401.gz' + gunzip JALC-LOD-20220401.gz + cd /srv/fatcat/src/python + + wc -l /srv/fatcat/datasets/JALC-LOD-20220401 + 9525225 + +Start with some samples: + + export FATCAT_AUTH_WORKER_JALC=[...] + shuf -n100 /srv/fatcat/datasets/JALC-LOD-20220401 | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + # Counter({'total': 100, 'exists': 89, 'insert': 11, 'skip': 0, 'update': 0}) + +Full import (single threaded): + + cat /srv/fatcat/datasets/JALC-LOD-20220401 | pv -l | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + # 9.53M 22:26:06 [ 117 /s] + # Counter({'total': 9510096, 'exists': 8589731, 'insert': 915032, 'skip': 5333, 'inserted.container': 119, 'update': 0}) + +Wow, almost a million new releases! 
7,417,245 results for `doi_registrar:jalc`. diff --git a/extra/bulk_edits/2022-07-12_orcid.md b/extra/bulk_edits/2022-07-12_orcid.md new file mode 100644 index 00000000..760a16c8 --- /dev/null +++ b/extra/bulk_edits/2022-07-12_orcid.md @@ -0,0 +1,64 @@ + +Annual ORCID import, using 2021 public data file. Didn't do this last year, so +a catch-up, and will need to do another update later in 2022 (presumably in +November/December). + +Not sure how many records this year. Current count on the orcid.org website is +over 14 million ORCIDs, in July 2022. + +Files downloaded from: + +- <https://info.orcid.org/orcids-2021-public-data-file-is-now-available> +- <https://orcid.figshare.com/articles/dataset/ORCID_Public_Data_File_2021/16750535> +- <https://archive.org/details/orcid-dump-2021> + +## Prep + + ia upload orcid-dump-2021 -m collection:ia_biblio_metadata ORCID_2021_10_* orcid-logo.png + + wget https://github.com/ORCID/orcid-conversion-lib/raw/master/target/orcid-conversion-lib-3.0.7-full.jar + + java -jar orcid-conversion-lib-3.0.7-full.jar --tarball -i ORCID_2021_10_summaries.tar.gz -v v3_0 -o ORCID_2021_10_summaries_json.tar.gz + + tar xvf ORCID_2021_10_summaries_json.tar.gz + + fd .json ORCID_2021_10_summaries/ | parallel cat {} | jq . -c | pv -l | gzip > ORCID_2021_10_summaries.json.gz + # 12.6M 27:59:25 [ 125 /s] + + zcat ORCID_2021_10_summaries.json.gz | shuf -n10000 | gzip > ORCID_2021_10_summaries.sample_10k.json.gz + + ia upload orcid-dump-2021 ORCID_2021_10_summaries.json.gz ORCID_2021_10_summaries.sample_10k.json.gz + +## Import + +Fetch to prod machine: + + wget https://archive.org/download/orcid-dump-2021/ORCID_2021_10_summaries.json.gz + wget https://archive.org/download/orcid-dump-2021/ORCID_2021_10_summaries.sample_10k.json.gz + +Sample: + + export FATCAT_AUTH_WORKER_ORCID=[...] 
+ zcat /srv/fatcat/datasets/ORCID_2021_10_summaries.sample_10k.json.gz | ./fatcat_import.py orcid - + # in 2020: Counter({'total': 10000, 'exists': 7356, 'insert': 2465, 'skip': 179, 'update': 0}) + # this time: Counter({'total': 10000, 'exists': 7577, 'insert': 2191, 'skip': 232, 'update': 0}) + +Bulk import: + + export FATCAT_AUTH_WORKER_ORCID=[...] + time zcat /srv/fatcat/datasets/ORCID_2021_10_summaries.json.gz | pv -l | parallel -j8 --round-robin --pipe ./fatcat_import.py orcid - + 12.6M 1:24:04 [2.51k/s] + Counter({'total': 1574111, 'exists': 1185437, 'insert': 347039, 'skip': 41635, 'update': 0}) + Counter({'total': 1583157, 'exists': 1193341, 'insert': 348187, 'skip': 41629, 'update': 0}) + Counter({'total': 1584441, 'exists': 1193385, 'insert': 349424, 'skip': 41632, 'update': 0}) + Counter({'total': 1575971, 'exists': 1187270, 'insert': 347190, 'skip': 41511, 'update': 0}) + Counter({'total': 1577323, 'exists': 1188892, 'insert': 346759, 'skip': 41672, 'update': 0}) + Counter({'total': 1586719, 'exists': 1195610, 'insert': 349115, 'skip': 41994, 'update': 0}) + Counter({'total': 1578484, 'exists': 1189423, 'insert': 347276, 'skip': 41785, 'update': 0}) + Counter({'total': 1578728, 'exists': 1190316, 'insert': 346445, 'skip': 41967, 'update': 0}) + + real 84m5.297s + user 436m26.428s + sys 41m36.959s + +Roughly 2.8 million new ORCIDs (2,781,435 inserts summed across the eight workers), great! diff --git a/extra/bulk_edits/2022-07-19_doaj.md b/extra/bulk_edits/2022-07-19_doaj.md new file mode 100644 index 00000000..d25f2dda --- /dev/null +++ b/extra/bulk_edits/2022-07-19_doaj.md @@ -0,0 +1,78 @@ + +Doing a batch import of DOAJ articles. Will need to do another one of these +soon after setting up daily (OAI-PMH feed) ingest. 
+ +## Prep + + wget https://doaj.org/csv + wget https://doaj.org/public-data-dump/journal + wget https://doaj.org/public-data-dump/article + + mv csv journalcsv__doaj_20220719_2135_utf8.csv + mv journal doaj_journal_data_2022-07-19.tar.gz + mv article doaj_article_data_2022-07-19.tar.gz + + ia upload doaj_data_2022-07-19 -m collection:ia_biblio_metadata ../logo_cropped.jpg journalcsv__doaj_20220719_2135_utf8.csv doaj_journal_data_2022-07-19.tar.gz doaj_article_data_2022-07-19.tar.gz + + tar xvf doaj_journal_data_2022-07-19.tar.gz + cat doaj_journal_data_*/journal_batch_*.json | jq .[] -c | pv -l | gzip > doaj_journal_data_2022-07-19_all.json.gz + + tar xvf doaj_article_data_2022-07-19.tar.gz + cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2022-07-19_all.json.gz + + ia upload doaj_data_2022-07-19 doaj_journal_data_2022-07-19_all.json.gz doaj_article_data_2022-07-19_all.json.gz + +On fatcat machine: + + cd /srv/fatcat/datasets + wget https://archive.org/download/doaj_data_2022-07-19/doaj_article_data_2022-07-19_all.json.gz + +## Prod Article Import + + git rev: 582495f66e5e08b6e257360097807711e53008d4 + (includes DOAJ container-id required patch) + + date: Tue Jul 19 22:46:42 UTC 2022 + + `doaj_id:*`: 1,335,195 hits + +Start with sample: + + zcat /srv/fatcat/datasets/doaj_article_data_2022-07-19_all.json.gz | shuf -n1000 > /srv/fatcat/datasets/doaj_article_data_2022-07-19_sample.json + + export FATCAT_AUTH_WORKER_DOAJ=[...] + cat /srv/fatcat/datasets/doaj_article_data_2022-07-19_sample.json | pv -l | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # Counter({'total': 1000, 'exists': 895, 'exists-fuzzy': 93, 'insert': 9, 'skip': 3, 'skip-no-container': 3, 'update': 0}) + +Pretty few imports. + +Full ingest: + + export FATCAT_AUTH_WORKER_DOAJ=[...] 
+ zcat /srv/fatcat/datasets/doaj_article_data_2022-07-19_all.json.gz | pv -l | parallel -j6 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # Counter({'total': 1282908, 'exists': 1145439, 'exists-fuzzy': 117120, 'insert': 16357, 'skip': 3831, 'skip-no-container': 2641, 'skip-title': 1190, 'skip-doaj-id-mismatch': 161, 'update': 0}) + +The counter above is from a single worker; times 6x (for `-j6`), around 100k releases added. + +Got a bunch of: + + /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=fcdb7a7a9729403d8d99a21f6970dd1d ident=wesvmjwihvblzayfmrvvgr4ulm + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=1455dfe24583480883dbbb293a4bc0c6 ident=lfw57esesjbotms3grvvods5dq + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=88fa65a33c8e484091fc76f4cda59c25 ident=22abqt5qe5e7ngjd5fkyvzyc4q + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=eb7b03dc3dc340cea36891a68a50cce7 ident=ljedohlfyzdkxebgpcswjtd77q + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/doaj_article.py:233: UserWarning: unexpected DOAJ ext_id match after lookup failed doaj=519617147ce248ea88d45ab098342153 ident=a63bqkttrbhyxavfr7li2w2xf4 + +Should investigate! + +Also, noticed that DOAJ importer is hitting `api.fatcat.wiki`, not the public +API endpoint. Guessing this is via fuzzycat. + +1,434,266 results for `doaj_id:*`. + +Then did a follow-up sandcrawler ingest, see notes in that repository. Note +that newer ingest can crawl doaj.org, bypassing the sandcrawler SQL load, but +the direct crawling is probably still faster. 
diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index f7b9e536..3c7be454 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -16,6 +16,21 @@ Ran a journal-level metadata update, using chocula. Cleaned up just under 500 releases with missing `container_id` from an older DOAJ article import. +Imported roughly 100k releases from DOAJ, new since 2022-04. + +Imported roughly 2.8 million new ORCID `creator` entities, using the 2021 dump +(first update since 2020 dump). + +Imported almost 1 million new DOI release entities from JALC, first update in +more than a year. + +Imported at least 400 new dblp containers, and an unknown number of new dblp +release entities. + +Cleaned up about a thousand containers with incorrect `publisher_type`, based +on current publisher name. Further updates will populate after the next chocula +import. + ## 2022-04 diff --git a/extra/cleanups/container_publisher_type.md b/extra/cleanups/container_publisher_type.md new file mode 100644 index 00000000..dba800d3 --- /dev/null +++ b/extra/cleanups/container_publisher_type.md @@ -0,0 +1,100 @@ + +A bunch of MDPI journals are incorrectly listed as 'longtail'. + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --count + # 245 + +Because this is 'extra' metadata, need a little python script to change the +metadata (fatcat-cli doesn't have this feature yet): + + import sys + import json + + publisher_type = sys.argv[1].strip().lower() + #print(publisher_type, file=sys.stderr) + + for line in sys.stdin: + if not line.strip(): + continue + container = json.loads(line) + container["extra"]["publisher_type"] = publisher_type + print(json.dumps(container)) + +Run some cleanups: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] 
+ export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 50 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" + # editgroup_oum6mnkl2rbn3jaua4a2gdlj5q + +Looks good, run the rest: + + fatcat-cli search container 'publisher:mdpi publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Some more cleanup patterns: + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --count + # 84 + + fatcat-cli search container 'publisher:"Frontiers Media SA" publisher_type:* !publisher_type:oa' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "oa")' -c \ + | python3 ./container_publisher_type.py oa \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --count + # 47 + + fatcat-cli search container 'publisher:"Walter de Gruyter" publisher_type:* !publisher_type:commercial !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "commercial")' -c \ + | python3 ./container_publisher_type.py commercial \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 56 + + fatcat-cli search container 'publisher:"springer" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 
'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 98 + + fatcat-cli search container 'publisher:"elsevier" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 37 + + fatcat-cli search container 'publisher:"wiley" publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 558 + + fatcat-cli search container 'publisher:taylor publisher:francis publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + | fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --count + # 28 + + fatcat-cli search container 'publisher:sage publisher_type:* !publisher_type:big5 !publisher_type:archive' --entity-json --limit 300 \ + | jq 'select(.publisher_type != "big5")' -c \ + | python3 ./container_publisher_type.py big5 \ + 
| fatcat-cli batch update container --description "Update container publisher_type" --auto-accept + +Overall, around a thousand containers updated. Changes to releases will not be +reflected until they are re-indexed. diff --git a/extra/stats/2022-07-14-prod-stats.json b/extra/stats/2022-07-14-prod-stats.json new file mode 100644 index 00000000..62d06606 --- /dev/null +++ b/extra/stats/2022-07-14-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":6036957,"timestamp":"2022-07-14T18:53:18.228827+00:00"}},"container":{"total":193300},"papers":{"in_kbart":78102604,"in_web":36247601,"in_web_not_kbart":18551021,"is_oa":25281045,"total":128995907},"release":{"refs_total":1340195856,"total":184966214}} diff --git a/extra/stats/2022-07-14-prod-table-sizes.txt b/extra/stats/2022-07-14-prod-table-sizes.txt new file mode 100644 index 00000000..b4fae69a --- /dev/null +++ b/extra/stats/2022-07-14-prod-table-sizes.txt @@ -0,0 +1,47 @@ +PostgreSQL 13.5 - wbgrp-svc502.us.archive.org +Size: 735.11G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 88 GB | 32 GB | 121 GB + "public"."refs_blob" | 119 GB | 2200 MB | 121 GB + "public"."release_rev" | 85 GB | 25 GB | 110 GB + "public"."file_rev" | 36 GB | 29 GB | 65 GB + "public"."release_edit" | 18 GB | 21 GB | 39 GB + "public"."file_rev_url" | 31 GB | 8106 MB | 39 GB + "public"."abstracts" | 35 GB | 3671 MB | 39 GB + "public"."work_edit" | 17 GB | 20 GB | 37 GB + "public"."file_edit" | 18 GB | 16 GB | 34 GB + "public"."release_ident" | 12 GB | 12 GB | 23 GB + "public"."work_ident" | 12 GB | 11 GB | 23 GB + "public"."file_rev_release" | 8975 MB | 10 GB | 19 GB + "public"."file_ident" | 7775 MB | 7615 MB | 15 GB + "public"."work_rev" | 7753 MB | 5238 MB | 13 GB + "public"."release_ref" | 6721 MB | 5662 MB | 12 GB + "public"."release_rev_abstract" | 5035 MB | 7250 MB | 12 GB + "public"."webcapture_rev_cdx" | 
4341 MB | 419 MB | 4760 MB + "public"."creator_edit" | 934 MB | 1042 MB | 1976 MB + "public"."creator_rev" | 928 MB | 730 MB | 1658 MB + "public"."editgroup" | 1294 MB | 256 MB | 1550 MB + "public"."creator_ident" | 631 MB | 647 MB | 1277 MB + "public"."release_rev_extid" | 524 MB | 649 MB | 1173 MB + "public"."changelog" | 383 MB | 301 MB | 685 MB + "public"."container_rev" | 249 MB | 60 MB | 308 MB + "public"."webcapture_edit" | 82 MB | 53 MB | 135 MB + "public"."container_edit" | 63 MB | 69 MB | 132 MB + "public"."webcapture_rev_url" | 65 MB | 22 MB | 87 MB + "public"."webcapture_rev_release" | 24 MB | 35 MB | 59 MB + "public"."webcapture_rev" | 45 MB | 14 MB | 59 MB + "public"."webcapture_ident" | 27 MB | 27 MB | 54 MB + "public"."container_ident" | 13 MB | 20 MB | 34 MB + "public"."auth_oidc" | 104 kB | 160 kB | 264 kB + "public"."editor" | 96 kB | 160 kB | 256 kB + "public"."editgroup_annotation" | 80 kB | 48 kB | 128 kB + "public"."fileset_rev_file" | 88 kB | 32 kB | 120 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) |