From eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 19:58:20 -0800 Subject: notes on file_meta partial cleanup --- notes/bulk_edits/2021-11-24_file_meta.md | 41 ++++++++++++++++++++++++++++++++ notes/bulk_edits/CHANGELOG.md | 2 ++ 2 files changed, 43 insertions(+) create mode 100644 notes/bulk_edits/2021-11-24_file_meta.md (limited to 'notes/bulk_edits') diff --git a/notes/bulk_edits/2021-11-24_file_meta.md b/notes/bulk_edits/2021-11-24_file_meta.md new file mode 100644 index 00000000..1ec1698b --- /dev/null +++ b/notes/bulk_edits/2021-11-24_file_meta.md @@ -0,0 +1,41 @@ + +Another partial batch of pure `file_meta` updates to file entities. These came +from re-attempting ingest by URL of existing file entities. + +Not all ran as expected, partially because of GROBID issues, and partially +because we had alternate captures for the same URLs. + +Still, about half the attempts worked, so we are going to update a fraction of +the ~520k outstanding file entities with partial metadata (eg, missing sha256). + +See cleanups `file_meta` document for prep and QA testing notes. + + +## Production Commands + + git log | head -n1 + commit 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 + + export FATCAT_AUTH_API_TOKEN=[...] 
# sandcrawler-bot + +Start with a small sample: + + cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.sample.json \ + | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 100, 'skip-existing-complete': 45, 'update': 43, 'skip-no-match': 12, 'skip': 0, 'insert': 0, 'exists': 0}) + +Then run in parallel with full batch: + + cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.json \ + | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 41846, 'update': 19737, 'skip-existing-complete': 18788, 'skip-no-match': 3321, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41522, 'update': 19678, 'skip-existing-complete': 18607, 'skip-no-match': 3237, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41537, 'update': 20517, 'skip-existing-complete': 17895, 'skip-no-match': 3125, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41529, 'update': 19684, 'skip-existing-complete': 18501, 'skip-no-match': 3344, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41530, 'update': 19595, 'skip-existing-complete': 18637, 'skip-no-match': 3298, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41542, 'update': 21359, 'skip-existing-complete': 17033, 'skip-no-match': 3150, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41534, 'update': 19758, 'skip-existing-complete': 18516, 'skip-no-match': 3260, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41537, 'update': 20507, 'skip-existing-complete': 15543, 'skip-no-match': 5487, 'skip': 0, 'insert': 0, 'exists': 0}) + +Import ran pretty fast! Updated about 160k file entities. More like 1/3 than +1/2 of the 520k that were missing SHA-256. 
diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md index cf5d9829..6156721c 100644 --- a/notes/bulk_edits/CHANGELOG.md +++ b/notes/bulk_edits/CHANGELOG.md @@ -26,6 +26,8 @@ and specific final commands in this directory. Quick summary: but are all mismatched due to DOI slash/double-slash issues and will not be fixed in an automated way. - de-uplicated a few thousand file entities, on the basis of SHA-1 hash +- updated file metadata for around 160k file entities (a couple hundred + thousand remain with partial metadata) ## 2021-06 -- cgit v1.2.3