From ca3aedcefd5c3835b6fa9b685200ef12d635d22c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 9 Nov 2021 21:12:54 -0800 Subject: update cleanups notes --- notes/cleanups/file_release_ingest_bug.md | 48 +++++++++++++++++++++++++++++++ notes/cleanups/wayback_timestamps.md | 24 ++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/notes/cleanups/file_release_ingest_bug.md b/notes/cleanups/file_release_ingest_bug.md index 8690157a..d818905b 100644 --- a/notes/cleanups/file_release_ingest_bug.md +++ b/notes/cleanups/file_release_ingest_bug.md @@ -142,3 +142,51 @@ And create a sample file: shuf -n10000 /srv/fatcat/snapshots/file_release_bugfix_20211105.json > /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json + +## Testing in QA + + export FATCAT_AUTH_WORKER_CLEANUP=[...] + + head -n10 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ + | python -m fatcat_tools.cleanups.file_release_bugfix - + # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0}) + + file_wmjzuybuorfrjdgnr2d32vg5va: was wrong, now correct, no other files + file_wgszcwehlffnzmoypr4l2yhvza: was correct, now correct, no other files. multiple articles in single PDF + file_p4r5sbminzgrhn4yaiqyr7ahwi: was correct, now wrong (!!!) + doi:10.1055/s-0036-1579844 + PDF says: 10.4103/0028-3886.158210 + unpaywall still has this incorrect linkage + file_n5jtvrnodbfdbccl5d6hshhvw4: now stub, was wrong + doi:10.19080/bboaj.2018.04.555640 release_suvtcm7hdbbr3fczcxxt4thgoi seems correct? + doi:10.19080/bboaj.4.4 is the bad DOI, not in fatcat. is registered, as an entire issue + file_kt5fv4d5lrbk7j3dxxgcm7hph4: was wrong, now correct + file_jq4juugnynhsdkh3whkcjod46q: was wrong, now correct + file_ef7w6y7k4jhjhgwmfai37yjjmm: was wrong, now correct + file_ca2svhd6knfnff4dktuonp45du: was correct, now correct. complicated, multiple DOIs/copies/files of same work + file_borst2aewvcwzhucnyth2vf3lm: was correct, now correct. 
complicated, multiple DOIs, single file of same work + +Overall, seems like this might not be as much of an obvious improvement as +hoped! But still progress and more correct. + + head -n1000 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ + | python -m fatcat_tools.cleanups.file_release_bugfix - + # Counter({'total': 1000, 'update': 929, 'skip-existing-history-updated': 58, 'skip-existing-fixed': 10, 'skip': 3, 'skip-link-source': 3, 'insert': 0, 'exists': 0}) + +Looking at `skip-link-source`, these are cases where `link_source` is 'doi' not +'fatcat-changelog'. Will update filter behavior, 'fatcat-changelog' is an +`ingest_request_source`. + +Checking another 10 examples. They all seem to end up as correct matches. + +Did a small update and running the whole batch: + + cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ + | python -m fatcat_tools.cleanups.file_release_bugfix - + # Counter({'total': 10000, 'update': 8499, 'skip-existing-fixed': 939, 'skip-existing-history-updated': 560, 'skip': 2, 'skip-wrong-release-is-ok': 2, 'insert': 0, 'exists': 0}) + +I think this is ready to go! Example with parallel: + + cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \ + | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_release_bugfix - + diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md index 85e5f94f..38bc3f7d 100644 --- a/notes/cleanups/wayback_timestamps.md +++ b/notes/cleanups/wayback_timestamps.md @@ -265,3 +265,27 @@ Running in bulk again: | gzip \ > files_20211105_moreshortts.fetched.json.gz +Ran into one: `requests.exceptions.HTTPError: 503 Server Error: Service +Temporarily Unavailable for url: [...]`. Will try again; if there are more +failures, may need to split up into smaller chunks. 
+ +Unexpected: + + Traceback (most recent call last): + File "./fetch_full_cdx_ts.py", line 200, in <module> + main() + File "./fetch_full_cdx_ts.py", line 197, in main + print(json.dumps(process_file(fe, session=session))) + File "./fetch_full_cdx_ts.py", line 118, in process_file + assert seg[4].isdigit() + AssertionError + 3.96M 3:04:46 [ 357 /s] + +Ugh. + + zcat files_20211105_moreshortts.json.gz \ + | tac \ + | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \ + | pv -l \ + | gzip \ + > files_20211105_moreshortts.fetched.json.gz -- cgit v1.2.3