From eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 19:58:20 -0800 Subject: notes on file_meta partial cleanup --- notes/bulk_edits/2021-11-24_file_meta.md | 41 ++++++++ notes/bulk_edits/CHANGELOG.md | 2 + notes/cleanups/file_meta.md | 152 +++++++++++++++++++++++++++ notes/cleanups/scripts/file2ingestrequest.py | 44 ++++++++ 4 files changed, 239 insertions(+) create mode 100644 notes/bulk_edits/2021-11-24_file_meta.md create mode 100644 notes/cleanups/file_meta.md create mode 100755 notes/cleanups/scripts/file2ingestrequest.py (limited to 'notes') diff --git a/notes/bulk_edits/2021-11-24_file_meta.md b/notes/bulk_edits/2021-11-24_file_meta.md new file mode 100644 index 00000000..1ec1698b --- /dev/null +++ b/notes/bulk_edits/2021-11-24_file_meta.md @@ -0,0 +1,41 @@ + +Another partial batch of pure `file_meta` updates to file entities. These came +from re-attempting ingest by URL of existing file entities. + +Not all ran as expected, partially because of GROBID issues, and partially +because we had alternate captures for the same URLs. + +Still, about half the attempts worked, so we are going to update a fraction of +the ~520k outstanding file entities with partial metadata (eg, missing sha256). + +See cleanups `file_meta` document for prep and QA testing notes. + + +## Production Commands + + git log | head -n1 + commit 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 + + export export FATCAT_AUTH_API_TOKEN=[...] # sandcrawler-bot + +Start with a small sample: + + cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.sample.json \ + | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 100, 'skip-existing-complete': 45, 'update': 43, 'skip-no-match': 12, 'skip': 0, 'insert': 0, 'exists': 0}) + +Then run in parallel with full batch: + + cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.json \ + | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 41846, 'update': 19737, 'skip-existing-complete': 18788, 'skip-no-match': 3321, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41522, 'update': 19678, 'skip-existing-complete': 18607, 'skip-no-match': 3237, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41537, 'update': 20517, 'skip-existing-complete': 17895, 'skip-no-match': 3125, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41529, 'update': 19684, 'skip-existing-complete': 18501, 'skip-no-match': 3344, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41530, 'update': 19595, 'skip-existing-complete': 18637, 'skip-no-match': 3298, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41542, 'update': 21359, 'skip-existing-complete': 17033, 'skip-no-match': 3150, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41534, 'update': 19758, 'skip-existing-complete': 18516, 'skip-no-match': 3260, 'skip': 0, 'insert': 0, 'exists': 0}) + # Counter({'total': 41537, 'update': 20507, 'skip-existing-complete': 15543, 'skip-no-match': 5487, 'skip': 0, 'insert': 0, 'exists': 0}) + +Import ran pretty fast! Updated about 160k file entities. More like 1/3 than +1/2 of the 520k that were missing SHA-256. diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md index cf5d9829..6156721c 100644 --- a/notes/bulk_edits/CHANGELOG.md +++ b/notes/bulk_edits/CHANGELOG.md @@ -26,6 +26,8 @@ and specific final commands in this directory. Quick summary: but are all mismatched due to DOI slash/double-slash issues and will not be fixed in an automated way. - de-uplicated a few thousand file entities, on the basis of SHA-1 hash +- updated file metadata for around 160k file entities (a couple hundred + thousand remain with partial metadata) ## 2021-06 diff --git a/notes/cleanups/file_meta.md b/notes/cleanups/file_meta.md new file mode 100644 index 00000000..d99e821e --- /dev/null +++ b/notes/cleanups/file_meta.md @@ -0,0 +1,152 @@ + +Over 500k file entities still lack complete metadata. For example, SHA-256 +checksums and verified mimetypes. + +Presumably these also lack GROBID processing. It seems that most or all of +these are simply wayback captures with no CDX metadata in sandcrawler-db, so +they didn't get update in prior cleanups. + +Current plan, re-using existing tools and processes, is to: + +1. create stub ingest requests containing file idents +2. process them "locally" on a large VM, in 'bulk' mode; writing output to stdout but using regular grobid and pdfextract "sinks" to Kafka +3. transform ingest results to a form for existing `file_meta` importer +4. run imports + +The `file_meta` importer requires just the `file_meta` dict from sandcrawler. + +## Prep + + zcat file_hashes.tsv.gz | pv -l | rg '\t\t' | wc -l + # 521,553 + + zcat file_export.json.gz \ + | rg -v '"sha256":' \ + | pv -l \ + | pigz \ + > files_missing_sha256.json.gz + # 521k 0:10:21 [ 839 /s] + +Want ingest requests with: + + base_url: str + ingest_type: "pdf" + link_source: "fatcat" + link_source_id: file ident (with "file_" prefix) + ingest_request_source: "file-backfill" + ext_ids: + sha1: str + +Use `file2ingestrequest.py` helper: + + zcat files_missing_sha256.json.gz \ + | ./file2ingestrequest.py \ + | pv -l \ + | pigz \ + > files_missing_sha256.ingest_request.json.gz + # 519k 0:00:19 [26.5k/s] + +So about 2k filtered out, will investigate later. + + zcat files_missing_sha256.ingest_request.json.gz \ + | shuf -n1000 \ + > files_missing_sha256.ingest_request.sample.json + + head -n100 files_missing_sha256.ingest_request.sample.json | ./ingest_tool.py requests --no-spn2 - > sample_results.json + 4 "no-capture" + 1 "no-pdf-link" + 95 "success" + +Seems like this is going to be a good start, but will need iteration. + +Dev testing: + + head files_missing_sha256.ingest_request.sample.json \ + | ./ingest_tool.py file-requests-backfill - --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 \ + > out_sample.json + + +## Commands + +Production warm-up: + + cat /srv/sandcrawler/tasks/files_missing_sha256.ingest_request.sample.json \ + | ./ingest_tool.py file-requests-backfill - --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 \ + > /srv/sandcrawler/tasks/files_missing_sha256.ingest_results.sample.json + +Production parallel run: + + zcat /srv/sandcrawler/tasks/files_missing_sha256.ingest_request.json \ + | parallel -j24 --linebuffer --round-robin --pipe ./ingest_tool.py file-requests-backfill - --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 \ + > /srv/sandcrawler/tasks/files_missing_sha256.ingest_results.json + +Filter and select file meta for import: + + head files_missing_sha256.ingest_results.json \ + | rg '"sha256hex"' \ + | jq 'select(.request.ext_ids.sha1 == .file_meta.sha1hex) | .file_meta' -c \ + > files_missing_sha256.file_meta.json + # Worker: Counter({'total': 20925, 'success': 20003, 'no-capture': 545, 'link-loop': 115, 'wrong-mimetype': 104, 'redirect-loop': 46, 'wayback-error': 25, 'null-body': 20, 'no-pdf-link': 18, 'skip-url-blocklist': 17, 'terminal-bad-status': 16, 'cdx-error': 9, 'wayback-content-error': 4, 'blocked-cookie': 3}) + # [etc] + + +Had some GROBID issues, so are not going to be able to get everything in first +pass. Merge our partial results, as just `file_meta`: + + cat files_missing_sha256.ingest_results.batch1.json files_missing_sha256.ingest_results.json \ + | jq .file_meta -c \ + | rg '"sha256hex"' \ + | pv -l \ + > files_missing_sha256.file_meta.json + # 386k 0:00:41 [9.34k/s] + +A bunch of these will need to be re-run once GROBID is in a healthier place. + +Check that we don't have (many) dupes: + + cat files_missing_sha256.file_meta.json \ + | jq .sha1hex -r \ + | sort \ + | uniq -D \ + | wc -l + # 86520 + +Huh, seems like a weirdly large number. Maybe related to re-crawling? Will need +to dedupe by sha1hex. + +Check how many dupes in original: + + zcat files_missing_sha256.ingest_request.json.gz | jq .ext_ids.sha1 -r | sort | uniq -D | wc -l + +That lines up with dupes expected before SHA-1 de-dupe run. + + cat files_missing_sha256.file_meta.json \ + | sort -u -S 4G \ + | pv -l \ + > files_missing_sha256.file_meta.uniq.json + + cat files_missing_sha256.file_meta.uniq.json \ + | jq .sha1hex -r \ + | sort \ + | uniq -D \ + | wc -l + # 0 + +Have seen a lot of errors like: + + %4|1637808915.562|TERMINATE|rdkafka#producer-1| [thrd:app]: Producer terminating with 1 message (650 bytes) still in queue or transit: use flush() to wait for outstanding message delivery + +TODO: add manual `finish()` calls on sinks in tool `run` function + +## QA Testing + + export FATCAT_API_AUTH_TOKEN... # sandcrawler-bot + + cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.sample.json \ + | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 1000, 'update': 503, 'skip-existing-complete': 403, 'skip-no-match': 94, 'skip': 0, 'insert': 0, 'exists': 0}) + + head -n1000 /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.json \ + | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 1000, 'update': 481, 'skip-existing-complete': 415, 'skip-no-match': 104, 'skip': 0, 'insert': 0, 'exists': 0}) + diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py new file mode 100755 index 00000000..a005837f --- /dev/null +++ b/notes/cleanups/scripts/file2ingestrequest.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +from typing import Optional +import json, sys + + +def transform(row: dict) -> Optional[dict]: + if row.get('mimetype') not in [None, 'application/pdf']: + return None + if row.get('state') != 'active': + return None + base_url = None + for url in (row.get('urls') or []): + url = url['url'] + if '://web.archive.org/' not in url and '://archive.org/' not in url: + base_url = url + break + if not base_url: + return None + if not row.get('sha1'): + return None + return dict( + base_url=base_url, + ingest_type="pdf", + link_source="fatcat", + link_source_id=f"file_{row['ident']}", + ingest_request_source="file-backfill", + ext_ids=dict( + sha1=row['sha1'], + ), + ) + + +def run(): + for l in sys.stdin: + if not l.strip(): + continue + row = json.loads(l) + request = transform(row) + if request: + print(json.dumps(request, sort_keys=True)) + +if __name__=="__main__": + run() -- cgit v1.2.3