diff options
-rw-r--r-- | extra/bulk_edits/2021-12-06_file_meta.md | 60 |
1 files changed, 60 insertions, 0 deletions
diff --git a/extra/bulk_edits/2021-12-06_file_meta.md b/extra/bulk_edits/2021-12-06_file_meta.md new file mode 100644 index 00000000..37c9f43c --- /dev/null +++ b/extra/bulk_edits/2021-12-06_file_meta.md @@ -0,0 +1,60 @@ + +Another partial batch of pure `file_meta` updates to file entities. These came +from new purpose-specific wayback CDX fetch script, then running pdfextract +worker. + +See cleanups `file_meta` document for prep details. + + +## Production Commands + + git log | head -n1 + # commit 9ad198b7a1e12504e3d4718e56edd0a7e8b5e61a + + export FATCAT_API_AUTH_TOKEN=[...] # sandcrawler-bot + +Start with a small sample: + + zcat /srv/fatcat/datasets/files_missing_sha256.file_meta.json.gz \ + | head -n100 \ + | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 100, 'update': 94, 'skip-existing-complete': 6, 'skip': 0, 'insert': 0, 'exists': 0}) + +Looks good, so run full batch in parallel: + + zcat /srv/fatcat/datasets/files_missing_sha256.file_meta.json.gz | wc -l + # 345,860 + + zcat /srv/fatcat/datasets/files_missing_sha256.file_meta.json.gz \ + | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + + # HTTP response body: {"success":false,"error":"ConstraintViolation","message":"unexpected database error: duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} + + Counter({'total': 32302, 'update': 28080, 'skip-existing-complete': 3112, 'skip-no-match': 1110, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 32302, 'update': 27995, 'skip-existing-complete': 3263, 'skip-no-match': 1044, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 32303, 'update': 27997, 'skip-existing-complete': 3225, 'skip-no-match': 1081, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 32302, 'update': 28008, 'skip-existing-complete': 3218, 'skip-no-match': 1076, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 32301, 'update': 28084, 'skip-existing-complete': 3154, 'skip-no-match': 1063, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 32302, 'update': 27890, 'skip-existing-complete': 3378, 'skip-no-match': 1034, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 32301, 'update': 28095, 'skip-existing-complete': 3216, 'skip-no-match': 990, 'skip': 0, 'insert': 0, 'exists': 0}) + +So roughly 224k updated out of ~258k attempted. Going to try again, with a shuffle, to see if more are updated: + + zcat /srv/fatcat/datasets/files_missing_sha256.file_meta.json.gz \ + | shuf \ + | pv -l \ + | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + + # 345k 0:08:16 [ 697 /s] + + Counter({'total': 41532, 'skip-existing-complete': 31130, 'update': 9027, 'skip-no-match': 1375, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 41529, 'skip-existing-complete': 31057, 'update': 9096, 'skip-no-match': 1376, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 41529, 'skip-existing-complete': 31015, 'update': 9190, 'skip-no-match': 1324, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 41530, 'skip-existing-complete': 31074, 'update': 9121, 'skip-no-match': 1335, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 41533, 'skip-existing-complete': 31163, 'update': 9049, 'skip-no-match': 1321, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 46145, 'skip-existing-complete': 34505, 'update': 10175, 'skip-no-match': 1465, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 46146, 'skip-existing-complete': 34551, 'update': 10101, 'skip-no-match': 1494, 'skip': 0, 'insert': 0, 'exists': 0}) + Counter({'total': 45916, 'skip-existing-complete': 34530, 'update': 9878, 'skip-no-match': 1508, 'skip': 0, 'insert': 0, 'exists': 0}) + +Another ~80k, so ~300k total. Good progress, though still at least a few tens +of thousands of these to track down and update (or just remove/delete). |