diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 19:58:20 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 19:58:20 -0800 |
commit | eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (patch) | |
tree | 28e396669b9758447bc35bd2190608ce5c4116c1 /notes/cleanups | |
parent | 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 (diff) | |
download | fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.tar.gz fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.zip |
notes on file_meta partial cleanup
Diffstat (limited to 'notes/cleanups')
-rw-r--r-- | notes/cleanups/file_meta.md | 152 | ||||
-rwxr-xr-x | notes/cleanups/scripts/file2ingestrequest.py | 44 |
2 files changed, 196 insertions, 0 deletions
diff --git a/notes/cleanups/file_meta.md b/notes/cleanups/file_meta.md new file mode 100644 index 00000000..d99e821e --- /dev/null +++ b/notes/cleanups/file_meta.md @@ -0,0 +1,152 @@ + +Over 500k file entities still lack complete metadata, for example SHA-256 +checksums and verified mimetypes. + +Presumably these also lack GROBID processing. It seems that most or all of +these are simply wayback captures with no CDX metadata in sandcrawler-db, so +they didn't get updated in prior cleanups. + +Current plan, re-using existing tools and processes, is to: + +1. create stub ingest requests containing file idents +2. process them "locally" on a large VM, in 'bulk' mode; writing output to stdout but using regular grobid and pdfextract "sinks" to Kafka +3. transform ingest results to a form for existing `file_meta` importer +4. run imports + +The `file_meta` importer requires just the `file_meta` dict from sandcrawler. + +## Prep + + zcat file_hashes.tsv.gz | pv -l | rg '\t\t' | wc -l + # 521,553 + + zcat file_export.json.gz \ + | rg -v '"sha256":' \ + | pv -l \ + | pigz \ + > files_missing_sha256.json.gz + # 521k 0:10:21 [ 839 /s] + +Want ingest requests with: + + base_url: str + ingest_type: "pdf" + link_source: "fatcat" + link_source_id: file ident (with "file_" prefix) + ingest_request_source: "file-backfill" + ext_ids: + sha1: str + +Use `file2ingestrequest.py` helper: + + zcat files_missing_sha256.json.gz \ + | ./file2ingestrequest.py \ + | pv -l \ + | pigz \ + > files_missing_sha256.ingest_request.json.gz + # 519k 0:00:19 [26.5k/s] + +So about 2k filtered out, will investigate later. + + zcat files_missing_sha256.ingest_request.json.gz \ + | shuf -n1000 \ + > files_missing_sha256.ingest_request.sample.json + + head -n100 files_missing_sha256.ingest_request.sample.json | ./ingest_tool.py requests --no-spn2 - > sample_results.json + 4 "no-capture" + 1 "no-pdf-link" + 95 "success" + +Seems like this is going to be a good start, but will need iteration. 
+ +Dev testing: + + head files_missing_sha256.ingest_request.sample.json \ + | ./ingest_tool.py file-requests-backfill - --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 \ + > out_sample.json + + +## Commands + +Production warm-up: + + cat /srv/sandcrawler/tasks/files_missing_sha256.ingest_request.sample.json \ + | ./ingest_tool.py file-requests-backfill - --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 \ + > /srv/sandcrawler/tasks/files_missing_sha256.ingest_results.sample.json + +Production parallel run: + + zcat /srv/sandcrawler/tasks/files_missing_sha256.ingest_request.json \ + | parallel -j24 --linebuffer --round-robin --pipe ./ingest_tool.py file-requests-backfill - --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 \ + > /srv/sandcrawler/tasks/files_missing_sha256.ingest_results.json + +Filter and select file meta for import: + + head files_missing_sha256.ingest_results.json \ + | rg '"sha256hex"' \ + | jq 'select(.request.ext_ids.sha1 == .file_meta.sha1hex) | .file_meta' -c \ + > files_missing_sha256.file_meta.json + # Worker: Counter({'total': 20925, 'success': 20003, 'no-capture': 545, 'link-loop': 115, 'wrong-mimetype': 104, 'redirect-loop': 46, 'wayback-error': 25, 'null-body': 20, 'no-pdf-link': 18, 'skip-url-blocklist': 17, 'terminal-bad-status': 16, 'cdx-error': 9, 'wayback-content-error': 4, 'blocked-cookie': 3}) + # [etc] + + +Had some GROBID issues, so we are not going to be able to get everything in the first +pass. 
Merge our partial results, as just `file_meta`: + + cat files_missing_sha256.ingest_results.batch1.json files_missing_sha256.ingest_results.json \ + | jq .file_meta -c \ + | rg '"sha256hex"' \ + | pv -l \ + > files_missing_sha256.file_meta.json + # 386k 0:00:41 [9.34k/s] + +A bunch of these will need to be re-run once GROBID is in a healthier place. + +Check that we don't have (many) dupes: + + cat files_missing_sha256.file_meta.json \ + | jq .sha1hex -r \ + | sort \ + | uniq -D \ + | wc -l + # 86520 + +Huh, seems like a weirdly large number. Maybe related to re-crawling? Will need +to dedupe by sha1hex. + +Check how many dupes in original: + + zcat files_missing_sha256.ingest_request.json.gz | jq .ext_ids.sha1 -r | sort | uniq -D | wc -l + +That lines up with dupes expected before SHA-1 de-dupe run. + + cat files_missing_sha256.file_meta.json \ + | sort -u -S 4G \ + | pv -l \ + > files_missing_sha256.file_meta.uniq.json + + cat files_missing_sha256.file_meta.uniq.json \ + | jq .sha1hex -r \ + | sort \ + | uniq -D \ + | wc -l + # 0 + +Have seen a lot of errors like: + + %4|1637808915.562|TERMINATE|rdkafka#producer-1| [thrd:app]: Producer terminating with 1 message (650 bytes) still in queue or transit: use flush() to wait for outstanding message delivery + +TODO: add manual `finish()` calls on sinks in tool `run` function + +## QA Testing + + export FATCAT_API_AUTH_TOKEN... 
# sandcrawler-bot + + cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.sample.json \ + | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 1000, 'update': 503, 'skip-existing-complete': 403, 'skip-no-match': 94, 'skip': 0, 'insert': 0, 'exists': 0}) + + head -n1000 /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.json \ + | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta - + # Counter({'total': 1000, 'update': 481, 'skip-existing-complete': 415, 'skip-no-match': 104, 'skip': 0, 'insert': 0, 'exists': 0}) + diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py new file mode 100755 index 00000000..a005837f --- /dev/null +++ b/notes/cleanups/scripts/file2ingestrequest.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +from typing import Optional +import json, sys + + +def transform(row: dict) -> Optional[dict]: + if row.get('mimetype') not in [None, 'application/pdf']: + return None + if row.get('state') != 'active': + return None + base_url = None + for url in (row.get('urls') or []): + url = url['url'] + if '://web.archive.org/' not in url and '://archive.org/' not in url: + base_url = url + break + if not base_url: + return None + if not row.get('sha1'): + return None + return dict( + base_url=base_url, + ingest_type="pdf", + link_source="fatcat", + link_source_id=f"file_{row['ident']}", + ingest_request_source="file-backfill", + ext_ids=dict( + sha1=row['sha1'], + ), + ) + + +def run(): + for l in sys.stdin: + if not l.strip(): + continue + row = json.loads(l) + request = transform(row) + if request: + print(json.dumps(request, sort_keys=True)) + +if __name__=="__main__": + run() |