diff options
-rw-r--r-- | notes/bulk_edits/2021-11-10_case_sensitive_dois.md | 53 | ||||
-rw-r--r-- | notes/bulk_edits/2021-11-10_file_release_ingest_bugfix.md | 108 | ||||
-rw-r--r-- | notes/bulk_edits/2021-11-11_wayback_short_ts.md | 52 | ||||
-rw-r--r-- | notes/bulk_edits/CHANGELOG.md | 18 | ||||
-rw-r--r-- | notes/cleanups/wayback_timestamps.md | 13 |
5 files changed, 244 insertions, 0 deletions
diff --git a/notes/bulk_edits/2021-11-10_case_sensitive_dois.md b/notes/bulk_edits/2021-11-10_case_sensitive_dois.md new file mode 100644 index 00000000..20772f56 --- /dev/null +++ b/notes/bulk_edits/2021-11-10_case_sensitive_dois.md @@ -0,0 +1,53 @@ + +## Production Run + +Start small: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + wc -l /srv/fatcat/datasets/nonlowercase_doi_releases.tsv + # 140530 + + head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.tsv \ + | python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + + # same command again to test not duping updates + Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0}) + + # example editgroup_cld5qe34bzg7xg7g4cz5skgaw4 + +Database size just before, while some other edits happening, PostgreSQL 11.6: 762.66G + +Ok, run a bunch in parallel: + + cat /srv/fatcat/datasets/nonlowercase_doi_releases.tsv \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi - + # Counter({'total': 24022, 'update': 24022, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 38836, 'update': 38836, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 38836, 'update': 38836, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 38836, 'update': 38736, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + +Over 3k TPS in `pg_activity`. + +Should have included `pv -l` in the pipeline. + +Final database size 763.14G, so only a couple hundred MByte of growth, totally +fine. + + +## Verification + +Re-dump release extids, in production: + + sudo -u postgres psql fatcat_prod < dump_release_extid.sql | egrep -v ^BEGIN$ | egrep -v ^ROLLBACK$ | pv -l | pigz > /srv/fatcat/snapshots/release_extid.tsv.gz + +Filter to non-lowercase DOIs: + + zcat release_extid.tsv.gz \ + | cut -f1,3 \ + | rg '[A-Z]' \ + | pv -l \ + > nonlowercase_doi.tsv + +Zero returned, hurray! diff --git a/notes/bulk_edits/2021-11-10_file_release_ingest_bugfix.md b/notes/bulk_edits/2021-11-10_file_release_ingest_bugfix.md new file mode 100644 index 00000000..6b5deb63 --- /dev/null +++ b/notes/bulk_edits/2021-11-10_file_release_ingest_bugfix.md @@ -0,0 +1,108 @@ + +## Production Run + +Start small: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + wc -l /srv/fatcat/datasets/file_release_bugfix_20211105.json + 228826 + + head -n100 /srv/fatcat/datasets/file_release_bugfix_20211105.json \ + | python -m fatcat_tools.cleanups.file_release_bugfix - + # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + + # example editgroup_keae3rfekffuriiy77f26rf6uq + +These are all now stubs (no release associated), which isn't the ratio seen in QA. Going to do a random sample: + + shuf -n100 /srv/fatcat/datasets/file_release_bugfix_20211105.json \ + | python -m fatcat_tools.cleanups.file_release_bugfix - + # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0}) + + # example editgroup_34mk525kxvdu3hak7g7fr7awru + +Looked at a few and all looked more like what would be expected, correct matches. + +Comparing before and after counts is going to be tricky, and will require a +full re-index for an accurate count. But did do a snapshot just before this run +(2021-11-10-prod-stats.json), and got 31,110,184 `in_web`. + +Full edit, in parallel: + + cat /srv/fatcat/datasets/file_release_bugfix_20211105.json \ + | pv -l \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_release_bugfix - + # 228k 0:26:34 [ 143 /s] + # Counter({'total': 26090, 'update': 26071, 'skip-existing-fixed': 15, 'skip': 4, 'skip-wrong-release-is-ok': 4, 'insert': 0, 'exists': 0}) + # Counter({'total': 26080, 'update': 26061, 'skip-existing-fixed': 12, 'skip': 7, 'skip-wrong-release-is-ok': 7, 'insert': 0, 'exists': 0}) + # Counter({'total': 27517, 'update': 27497, 'skip-existing-fixed': 15, 'skip': 5, 'skip-wrong-release-is-ok': 5, 'insert': 0, 'exists': 0}) + # Counter({'total': 29534, 'update': 29420, 'skip-existing-fixed': 110, 'skip': 4, 'skip-wrong-release-is-ok': 4, 'insert': 0, 'exists': 0}) + # Counter({'total': 29544, 'update': 29517, 'skip-existing-fixed': 16, 'skip': 11, 'skip-wrong-release-is-ok': 11, 'insert': 0, 'exists': 0}) + # Counter({'total': 29535, 'update': 29518, 'skip-existing-fixed': 10, 'skip': 7, 'skip-wrong-release-is-ok': 7, 'insert': 0, 'exists': 0}) + # Counter({'total': 30082, 'update': 30065, 'skip-existing-fixed': 13, 'skip': 4, 'skip-wrong-release-is-ok': 4, 'insert': 0, 'exists': 0}) + # Counter({'total': 30444, 'update': 30420, 'skip-existing-fixed': 21, 'skip': 3, 'skip-wrong-release-is-ok': 3, 'insert': 0, 'exists': 0}) + +## Verification + +Counts: + + SELECT file_edit.extra_json->>'ingest_request_source' as source, COUNT(*) as broken_files + FROM file_edit + LEFT JOIN file_ident ON file_edit.ident_id = file_ident.id + LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev + LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id + LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id + WHERE + file_edit.extra_json->>'link_source_id' IS NOT NULL + AND file_edit.extra_json->>'link_source_id' LIKE '10.%' + AND lower(release_rev.doi) != lower(file_edit.extra_json->>'link_source_id') + AND file_ident.rev_id = file_edit.rev_id + GROUP BY file_edit.extra_json->>'ingest_request_source'; + + source | broken_files + -----------+-------------- + unpaywall | 233 + (1 row) + +Examples: + + SELECT file_edit.ident_id as file_ident, release_ident.id as release_ident, file_edit.extra_json->>'link_source_id' as file_edit_doi, release_rev.doi as release_doi + FROM file_edit + LEFT JOIN file_ident ON file_edit.ident_id = file_ident.id + LEFT JOIN file_rev_release ON file_edit.rev_id = file_rev_release.file_rev + LEFT JOIN release_ident ON file_rev_release.target_release_ident_id = release_ident.id + LEFT JOIN release_rev ON release_rev.id = release_ident.rev_id + WHERE + file_edit.extra_json->>'link_source_id' IS NOT NULL + AND file_edit.extra_json->>'link_source_id' LIKE '10.%' + AND lower(release_rev.doi) != lower(file_edit.extra_json->>'link_source_id') + AND file_ident.rev_id = file_edit.rev_id + LIMIT 20; + + +Looks like many of the remaining mismatches are from "double-slash" normalization, with doi prefix 10.1037: + + file_ident | release_ident | file_edit_doi | release_doi + --------------------------------------+--------------------------------------+------------------------------+------------------------------ + ae2f7864-66a6-4a82-a0e6-153cb4d0b03a | 0f436ae6-d7b4-4a45-a434-d158bc4a3437 | 10.1037/0096-1523.25.6.1568 | 10.1037//0096-1523.25.6.1568 + d02ff5ab-a882-4a86-8a94-ce6222708323 | 2d5ebbca-e4ba-4bb7-bb19-f1e081479eab | 10.1037//0021-9010.63.4.467 | 10.1037/0021-9010.63.4.467 + 2c107387-f57f-4855-bc1a-e40704f1e9b4 | 7654b956-4776-4f6f-bc35-ccf7e6bfe99c | 10.1037/0022-0663.75.4.500 | 10.1037//0022-0663.75.4.500 + 15e3636a-4bcf-4595-8a2a-b6b06a299a2f | c09e3531-1ac4-4bfa-9fcf-8acb9f0d845e | 10.1037//1064-1297.8.2.225 | 10.1037/1064-1297.8.2.225 + dc8b86c8-9b8e-4333-abbb-8811010d9c71 | bd91e7be-c360-47af-a634-f048e2c85b73 | 10.1037//0021-843x.105.4.637 | 10.1037/0021-843x.105.4.637 + 35a06e0a-6f72-4624-87ca-fbb74bc9d77d | 96befa26-6eb0-47c0-a0ec-e00282e33bff | 10.1037//0735-7044.99.5.964 | 10.1037/0735-7044.99.5.964 + 707bfaa1-65de-4dbb-9786-51b99d03d91d | 2d58524b-4216-4092-8ddf-336ac42d5955 | 10.1037/0096-1523.28.3.515 | 10.1037//0096-1523.28.3.515 + de9ea98f-672e-44ec-9d12-e11acd8990d0 | 20f1a857-ad51-4b80-9ce5-bc3a44df96b1 | 10.1037//0002-9432.71.1.72 | 10.1037/0002-9432.71.1.72 + 4275306c-11ef-4fce-bc03-3f1efe99f9a6 | c69bc740-4da1-4f96-acc9-151a0cef5c3f | 10.1037//1064-1297.6.1.107 | 10.1037/1064-1297.6.1.107 + 6a63d2ae-b953-48ba-a68a-061543d82ad4 | e3c8b8c1-defc-44ac-8c73-e21e8cf93f5c | 10.1037//0022-0167.23.6.557 | 10.1037/0022-0167.23.6.557 + 2fcbb54e-8fa8-4bbc-a2ae-4b6b6eaff412 | 8500b4a5-a693-4415-b4a4-4dcfb3403d82 | 10.1037/0021-9010.73.1.68 | 10.1037//0021-9010.73.1.68 + b9aa4601-4a1b-4146-aa6b-a410d0fc3dce | 954f2072-8c53-41c7-82b0-8c6fe9ef4d0c | 10.1037//0278-6133.13.4.319 | 10.1037/0278-6133.13.4.319 + b528b924-0680-43f3-81ad-d822e51b3373 | 69387969-40bc-451d-b567-8713296f60b0 | 10.1037//0002-9432.71.1.38 | 10.1037/0002-9432.71.1.38 + f64f1ee2-b787-4a06-87ab-46b94f9d5454 | c082d47f-175d-456a-a741-650b5eaa5173 | 10.1037//0021-843x.98.4.487 | 10.1037/0021-843x.98.4.487 + 86e8b655-963a-4c11-ae70-a8d528400682 | 6381254c-e339-4354-b0d8-711b5b5e4fcc | 10.1037/0022-0663.89.1.183 | 10.1037//0022-0663.89.1.183 + 716e1761-8120-480b-b096-e7698c65456a | 7a4d2c7d-32b7-4292-adbf-b791387a3ac5 | 10.1037//0278-7393.21.5.1209 | 10.1037/0278-7393.21.5.1209 + bb7aa131-d5e5-497e-8040-b1729850b94c | 1ce29f33-d020-4d5f-a8b3-2b8bef53ccb8 | 10.1037//1040-3590.7.4.533 | 10.1037/1040-3590.7.4.533 + 510ad392-43aa-42dc-9644-9697f425efd5 | 796c92ca-767a-495b-ad0c-f458381c071c | 10.1037//0278-7393.20.4.824 | 10.1037/0278-7393.20.4.824 + 32c57e68-0793-4ded-bca2-d05f3532ff3e | 1567a003-0b5a-48bb-bb7c-7dff2c44b90b | 10.1037//1040-3590.13.1.59 | 10.1037/1040-3590.13.1.59 + e0d1fd38-17d8-42ac-b9df-60312829ddd4 | 0cefaf5d-fcdf-4049-a9d7-0c569096478e | 10.1037//0022-006x.56.4.621 | 10.1037/0022-006x.56.4.621 + (20 rows) diff --git a/notes/bulk_edits/2021-11-11_wayback_short_ts.md b/notes/bulk_edits/2021-11-11_wayback_short_ts.md new file mode 100644 index 00000000..20349f0c --- /dev/null +++ b/notes/bulk_edits/2021-11-11_wayback_short_ts.md @@ -0,0 +1,52 @@ + +## Production Run + +At git commit `6ad9d24e4d7d901d6fc394e6e91575f6acba7ff4`. + +Start small: + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + zcat /srv/fatcat/datasets/files_20211105_moreshortts.fetched.json.gz \ + | head -n100 \ + | python -m fatcat_tools.cleanups.file_short_wayback_ts - + # Counter({'total': 100, 'update': 99, 'skip-bad-wayback-timestamp': 1, 'skip': 0, 'insert': 0, 'exists': 0}) + +Looks good! Run the full batch. + + zcat /srv/fatcat/datasets/files_20211105_moreshortts.fetched.json.gz \ + | pv -l \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_short_wayback_ts - + + [...] + bad replacement URL: partial_ts=2017 original=https://www.hydrol-earth-syst-sci.net/21/4959/2017/hess-21-4959-2017.pdf fix_url=https://web.archive.org/web/20180721004954/https://www.hydrol-earth-syst-sci.net/21/4959/2017/hess-21-4959-2017.pdf + bad replacement URL: partial_ts=2017 original=https://www.the-cryosphere.net/11/1537/2017/tc-11-1537-2017.pdf fix_url=https://web.archive.org/web/20180719235703/https://www.the-cryosphere.net/11/1537/2017/tc-11-1537-2017.pdf + bad replacement URL: partial_ts=2017 original=http://www.growingscience.com/msl/Vol7/msl_2017_26.pdf fix_url=https://web.archive.org/web/20180601235059/http://www.growingscience.com/msl/Vol7/msl_2017_26.pdf + bad replacement URL: partial_ts=2017 original=https://www.hydrol-earth-syst-sci.net/21/4115/2017/hess-21-4115-2017.pdf fix_url=https://web.archive.org/web/20180719162956/https://www.hydrol-earth-syst-sci.net/21/4115/2017/hess-21-4115-2017.pdf + bad replacement URL: partial_ts=2017 original=https://www.biogeosciences.net/14/4279/2017/bg-14-4279-2017.pdf fix_url=https://web.archive.org/web/20180720220056/https://www.biogeosciences.net/14/4279/2017/bg-14-4279-2017.pdf + bad replacement URL: partial_ts=2017 original=https://www.biogeosciences.net/14/3669/2017/bg-14-3669-2017.pdf fix_url=https://web.archive.org/web/20180720222828/https://www.biogeosciences.net/14/3669/2017/bg-14-3669-2017.pdf + [...] + bad replacement URL: partial_ts=2017 original=http://www.growingscience.com/msl/Vol7/msl_2017_28.pdf fix_url=https://web.archive.org/web/20180602071632/http://www.growingscience.com/msl/Vol7/msl_2017_28.pdf + bad replacement URL: partial_ts=2017 original=https://www.biogeosciences.net/14/4161/2017/bg-14-4161-2017.pdf fix_url=https://web.archive.org/web/20180720004438/https://www.biogeosciences.net/14/4161/2017/bg-14-4161-2017.pdf + bad replacement URL: partial_ts=2017 original=https://core.ac.uk/download/pdf/10915563.pdf fix_url=https://web.archive.org/web/20190220174144/https://core.ac.uk/download/pdf/10915563.pdf + bad replacement URL: partial_ts=2017 original=http://www.growingscience.com/ijiec/Vol9/IJIEC_2017_24.pdf fix_url=https://web.archive.org/web/20180602094300/http://www.growingscience.com/ijiec/Vol9/IJIEC_2017_24.pdf + bad replacement URL: partial_ts=2017 original=https://core.ac.uk/download/pdf/36046645.pdf fix_url=https://web.archive.org/web/20190220175351/https://core.ac.uk/download/pdf/36046645.pdf + bad replacement URL: partial_ts=2017 original=https://core.ac.uk/download/pdf/35085886.pdf fix_url=https://web.archive.org/web/20190220175410/https://core.ac.uk/download/pdf/35085886.pdf + bad replacement URL: partial_ts=2017 original=https://www.atmos-chem-phys.net/17/10349/2017/acp-17-10349-2017.pdf fix_url=https://web.archive.org/web/20181102190649/https://www.atmos-chem-phys.net/17/10349/2017/acp-17-10349-2017.pdf + bad replacement URL: partial_ts=2017 original=https://www.atmos-chem-phys.net/17/7775/2017/acp-17-7775-2017.pdf fix_url=https://web.archive.org/web/20181101041355/https://www.atmos-chem-phys.net/17/7775/2017/acp-17-7775-2017.pdf + bad replacement URL: partial_ts=2017 original=http://www.veterinaryworld.org/Vol.10/March-2017/5.pdf fix_url=https://web.archive.org/web/20180721074940/http://www.veterinaryworld.org/Vol.10/March-2017/5.pdf + bad replacement URL: partial_ts=2017 original=https://www.ann-geophys.net/35/189/2017/angeo-35-189-2017.pdf fix_url=https://web.archive.org/web/20180625214916/https://www.ann-geophys.net/35/189/2017/angeo-35-189-2017.pdf + [...] + + # 9.96M 12:57:06 [ 213 /s] + + Counter({'total': 1272301, 'update': 1268466, 'skip-bad-wayback-timestamp': 2808, 'skip': 1026, 'skip-status': 981, 'skip-bad-replacement': 45, 'skip-bad-wayback': 1, 'insert': 0, 'exists': 0}) + Counter({'total': 1242814, 'update': 1239042, 'skip-bad-wayback-timestamp': 2734, 'skip': 1036, 'skip-status': 974, 'skip-bad-replacement': 62, 'skip-bad-wayback': 2, 'insert': 0, 'exists': 0}) + Counter({'total': 1264351, 'update': 1260695, 'skip-bad-wayback-timestamp': 2626, 'skip': 1030, 'skip-status': 977, 'skip-bad-replacement': 53, 'insert': 0, 'exists': 0}) + Counter({'total': 1244480, 'update': 1240779, 'skip-bad-wayback-timestamp': 2680, 'skip': 1020, 'skip-status': 962, 'skip-bad-replacement': 58, 'skip-bad-wayback': 1, 'insert': 0, 'exists': 0}) + Counter({'total': 1222678, 'update': 1219022, 'skip-bad-wayback-timestamp': 2698, 'skip': 956, 'skip-status': 892, 'skip-bad-replacement': 64, 'skip-bad-wayback': 2, 'insert': 0, 'exists': 0}) + Counter({'total': 1225078, 'update': 1221459, 'skip-bad-wayback-timestamp': 2597, 'skip': 1020, 'skip-status': 964, 'skip-bad-replacement': 56, 'skip-bad-wayback': 2, 'insert': 0, 'exists': 0}) + Counter({'total': 1283843, 'update': 1280014, 'skip-bad-wayback-timestamp': 2670, 'skip': 1059, 'skip-status': 997, 'skip-revision-changed': 99, 'skip-bad-replacement': 62, 'skip-bad-wayback': 1, 'insert': 0, 'exists': 0}) + Counter({'total': 1203309, 'update': 1199782, 'skip-bad-wayback-timestamp': 2556, 'skip': 971, 'skip-status': 923, 'skip-bad-replacement': 48, 'insert': 0, 'exists': 0}) + +On the order of 99.7% were updated/fixed, over 9.5 million file entities, taking almost 13 hours. diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md index ed989c41..d82e126e 100644 --- a/notes/bulk_edits/CHANGELOG.md +++ b/notes/bulk_edits/CHANGELOG.md @@ -9,6 +9,24 @@ this file should probably get merged into the guide at some point. This file should not turn in to a TODO list! + +## 2021-11 + +Ran a series of cleanups. See background and prep notes in `notes/cleanups/` +and specific final commands in this directory. Quick summary: + +- more than 9.5 million file entities had truncated timestamps wayback URLs, + and were fixed with the full timestamps. there are still a small fraction + (0.5%) which were identified but not corrected in this first pass +- over 140k release entities with non-lowercase DOIs were updated with + lowercase DOI. all DOIs in current release entities now lowercase (at least, + no ASCII uppercase characters found) +- over 220k file entities with incorrect release relation, due to an + import-time code bug, were fixed. a couple hundred questionable cases remain, + but are all mismatched due to DOI slash/double-slash issues and will not be + fixed in an automated way. + + ## 2021-06 Created new containers via chocula pipeline. Did not update any existing diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index 38bc3f7d..e3ea942d 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -289,3 +289,16 @@ Ugh. | pv -l \ | gzip \ > files_20211105_moreshortts.fetched.json.gz + # 9.96M 6:38:43 [ 416 /s] + +Looks like the last small tweak was successful! This was with git commit +`cd09c6d6bd4deef0627de4f8a8a301725db01e14`. + + + zcat files_20211105_moreshortts.fetched.json.gz | jq .status | sort | uniq -c | sort -nr + 6228307 "success-db" + 2876033 "success-self" + 846844 "success-api" + 7583 "fail-not-found" + 87 "fail-cdx-403" + |