author     Bryan Newbold <bnewbold@robocracy.org>  2021-11-24 19:58:20 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>  2021-11-24 19:58:20 -0800
commit     eb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (patch)
tree       28e396669b9758447bc35bd2190608ce5c4116c1 /notes/cleanups
parent     75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 (diff)
download   fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.tar.gz
           fatcat-eb60449cdc9614ec7eda79b8481d1d8487b9a5f6.zip
notes on file_meta partial cleanup
Diffstat (limited to 'notes/cleanups')
-rw-r--r--  notes/cleanups/file_meta.md                   152
-rwxr-xr-x  notes/cleanups/scripts/file2ingestrequest.py   44
2 files changed, 196 insertions, 0 deletions
diff --git a/notes/cleanups/file_meta.md b/notes/cleanups/file_meta.md
new file mode 100644
index 00000000..d99e821e
--- /dev/null
+++ b/notes/cleanups/file_meta.md
@@ -0,0 +1,152 @@
+
+Over 500k file entities still lack complete metadata, such as SHA-256
+checksums and verified mimetypes.
+
+Presumably these also lack GROBID processing. It seems that most or all of
+these are simply wayback captures with no CDX metadata in sandcrawler-db, so
+they didn't get updated in prior cleanups.
+
+Current plan, re-using existing tools and processes, is to:
+
+1. create stub ingest requests containing file idents
+2. process them "locally" on a large VM, in 'bulk' mode, writing ingest results to stdout but pushing GROBID and pdfextract output to the regular Kafka "sinks"
+3. transform ingest results to a form for existing `file_meta` importer
+4. run imports
+
+The `file_meta` importer requires just the `file_meta` dict from sandcrawler.
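+
+For reference, a sandcrawler `file_meta` record looks roughly like the
+following sketch. The `sha1hex`/`sha256hex` key names are the ones used in the
+commands below; the other fields and all values here are illustrative
+placeholders:
+
+    file_meta = {
+        "sha1hex": "<40-char hex>",
+        "sha256hex": "<64-char hex>",
+        "md5hex": "<32-char hex>",
+        "size_bytes": 123456,
+        "mimetype": "application/pdf",
+    }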
+
+## Prep
+
+ zcat file_hashes.tsv.gz | pv -l | rg '\t\t' | wc -l
+ # 521,553
+
+ zcat file_export.json.gz \
+ | rg -v '"sha256":' \
+ | pv -l \
+ | pigz \
+ > files_missing_sha256.json.gz
+ # 521k 0:10:21 [ 839 /s]
+
+Want ingest requests with these fields (example sketch below):
+
+ base_url: str
+ ingest_type: "pdf"
+ link_source: "fatcat"
+ link_source_id: file ident (with "file_" prefix)
+ ingest_request_source: "file-backfill"
+ ext_ids:
+ sha1: str
+
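+A single stub request, as emitted by the helper described next, would look
+something like this sketch (URL, ident, and SHA-1 are placeholders):
+
+    request = {
+        "base_url": "https://example.com/paper.pdf",
+        "ingest_type": "pdf",
+        "link_source": "fatcat",
+        "link_source_id": "file_<ident>",
+        "ingest_request_source": "file-backfill",
+        "ext_ids": {"sha1": "<40-char hex>"},
+    }
+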
+Use `file2ingestrequest.py` helper:
+
+ zcat files_missing_sha256.json.gz \
+ | ./file2ingestrequest.py \
+ | pv -l \
+ | pigz \
+ > files_missing_sha256.ingest_request.json.gz
+ # 519k 0:00:19 [26.5k/s]
+
+So about 2k rows were filtered out; will investigate later.
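+
+One way to investigate would be to tally the helper's filter conditions over
+the same input; a sketch of a hypothetical one-off script that mirrors the
+checks in `file2ingestrequest.py` (not run yet):
+
+    import json
+    import sys
+    from collections import Counter
+
+    def drop_reason(row: dict) -> str:
+        # mirror the filters in file2ingestrequest.py transform()
+        if row.get('mimetype') not in [None, 'application/pdf']:
+            return 'non-pdf-mimetype'
+        if row.get('state') != 'active':
+            return 'not-active'
+        urls = [u['url'] for u in (row.get('urls') or [])]
+        if not any('://web.archive.org/' not in u and '://archive.org/' not in u for u in urls):
+            return 'only-archive-urls'
+        if not row.get('sha1'):
+            return 'missing-sha1'
+        return 'kept'
+
+    counts = Counter(drop_reason(json.loads(l)) for l in sys.stdin if l.strip())
+    for reason, count in counts.most_common():
+        print(f"{count}\t{reason}")
+
+Feeding `files_missing_sha256.json.gz` through this (via `zcat`) should account
+for the ~2k difference.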
+
+ zcat files_missing_sha256.ingest_request.json.gz \
+ | shuf -n1000 \
+ > files_missing_sha256.ingest_request.sample.json
+
+ head -n100 files_missing_sha256.ingest_request.sample.json | ./ingest_tool.py requests --no-spn2 - > sample_results.json
+ 4 "no-capture"
+ 1 "no-pdf-link"
+ 95 "success"
+
+Seems like this is going to be a good start, but will need iteration.
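+
+The status tally above can be reproduced with a quick count over the results; a
+sketch, assuming each line of `sample_results.json` is a JSON object with a
+top-level `status` field (as in the ingest result counters further down):
+
+    import json
+    from collections import Counter
+
+    with open('sample_results.json') as f:
+        statuses = Counter(json.loads(line)['status'] for line in f if line.strip())
+    for status, count in statuses.most_common():
+        print(f"{count:5d} {json.dumps(status)}")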
+
+Dev testing:
+
+ head files_missing_sha256.ingest_request.sample.json \
+ | ./ingest_tool.py file-requests-backfill - --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 \
+ > out_sample.json
+
+
+## Commands
+
+Production warm-up:
+
+ cat /srv/sandcrawler/tasks/files_missing_sha256.ingest_request.sample.json \
+ | ./ingest_tool.py file-requests-backfill - --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 \
+ > /srv/sandcrawler/tasks/files_missing_sha256.ingest_results.sample.json
+
+Production parallel run:
+
+ zcat /srv/sandcrawler/tasks/files_missing_sha256.ingest_request.json \
+ | parallel -j24 --linebuffer --round-robin --pipe ./ingest_tool.py file-requests-backfill - --kafka-env qa --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --grobid-host http://localhost:8070 \
+ > /srv/sandcrawler/tasks/files_missing_sha256.ingest_results.json
+
+Filter and select file meta for import:
+
+ head files_missing_sha256.ingest_results.json \
+ | rg '"sha256hex"' \
+ | jq 'select(.request.ext_ids.sha1 == .file_meta.sha1hex) | .file_meta' -c \
+ > files_missing_sha256.file_meta.json
+ # Worker: Counter({'total': 20925, 'success': 20003, 'no-capture': 545, 'link-loop': 115, 'wrong-mimetype': 104, 'redirect-loop': 46, 'wayback-error': 25, 'null-body': 20, 'no-pdf-link': 18, 'skip-url-blocklist': 17, 'terminal-bad-status': 16, 'cdx-error': 9, 'wayback-content-error': 4, 'blocked-cookie': 3})
+ # [etc]
+
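+The `jq` select above is a consistency check: only keep results where the SHA-1
+from the original request matches the SHA-1 that sandcrawler computed for the
+fetched file. The same filter as a Python sketch (file names as in the commands
+above):
+
+    import json
+
+    with open('files_missing_sha256.ingest_results.json') as inf, \
+            open('files_missing_sha256.file_meta.json', 'w') as outf:
+        for line in inf:
+            if not line.strip():
+                continue
+            result = json.loads(line)
+            file_meta = result.get('file_meta') or {}
+            request = result.get('request') or {}
+            # keep only complete file_meta records where the hashes agree
+            if file_meta.get('sha256hex') and \
+                    request.get('ext_ids', {}).get('sha1') == file_meta.get('sha1hex'):
+                outf.write(json.dumps(file_meta, sort_keys=True) + '\n')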
+
+Had some GROBID issues, so we are not going to be able to get everything in the
+first pass. Merge our partial results, keeping just the `file_meta` objects:
+
+ cat files_missing_sha256.ingest_results.batch1.json files_missing_sha256.ingest_results.json \
+ | jq .file_meta -c \
+ | rg '"sha256hex"' \
+ | pv -l \
+ > files_missing_sha256.file_meta.json
+ # 386k 0:00:41 [9.34k/s]
+
+A bunch of these will need to be re-run once GROBID is in a healthier place.
+
+Check that we don't have (many) dupes:
+
+ cat files_missing_sha256.file_meta.json \
+ | jq .sha1hex -r \
+ | sort \
+ | uniq -D \
+ | wc -l
+ # 86520
+
+Huh, seems like a weirdly large number. Maybe related to re-crawling? Will need
+to dedupe by sha1hex.
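+
+If whole-line `sort -u` (used below) turned out not to be enough, an explicit
+de-dupe by `sha1hex` could look like this sketch of a hypothetical filter
+script, keeping the first record seen per hash:
+
+    import json
+    import sys
+
+    seen = set()
+    for line in sys.stdin:
+        if not line.strip():
+            continue
+        file_meta = json.loads(line)
+        if file_meta['sha1hex'] in seen:
+            continue
+        seen.add(file_meta['sha1hex'])
+        sys.stdout.write(line)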
+
+Check how many dupes in original:
+
+ zcat files_missing_sha256.ingest_request.json.gz | jq .ext_ids.sha1 -r | sort | uniq -D | wc -l
+
+That lines up with the number of dupes expected before the SHA-1 de-dupe run.
+
+ cat files_missing_sha256.file_meta.json \
+ | sort -u -S 4G \
+ | pv -l \
+ > files_missing_sha256.file_meta.uniq.json
+
+ cat files_missing_sha256.file_meta.uniq.json \
+ | jq .sha1hex -r \
+ | sort \
+ | uniq -D \
+ | wc -l
+ # 0
+
+Have seen a lot of errors like:
+
+ %4|1637808915.562|TERMINATE|rdkafka#producer-1| [thrd:app]: Producer terminating with 1 message (650 bytes) still in queue or transit: use flush() to wait for outstanding message delivery
+
+TODO: add manual `finish()` calls on sinks in tool `run` function
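+
+The warning comes from librdkafka: the process exited while messages were still
+buffered in the producer. The fix the TODO describes amounts to flushing the
+producer before exit; with `confluent-kafka` directly that looks roughly like
+the following (a generic sketch, not sandcrawler's actual sink code; topic and
+broker are placeholders):
+
+    from confluent_kafka import Producer
+
+    producer = Producer({'bootstrap.servers': 'localhost:9092'})
+    producer.produce('example-topic', b'payload')
+    # block until all buffered messages are delivered (or fail), so nothing is
+    # dropped when the process exits
+    producer.flush()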
+
+## QA Testing
+
+ export FATCAT_API_AUTH_TOKEN... # sandcrawler-bot
+
+ cat /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.sample.json \
+ | ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+ # Counter({'total': 1000, 'update': 503, 'skip-existing-complete': 403, 'skip-no-match': 94, 'skip': 0, 'insert': 0, 'exists': 0})
+
+ head -n1000 /srv/fatcat/datasets/files_missing_sha256.file_meta.uniq.json \
+ | parallel -j8 --round-robin --pipe -q ./fatcat_import.py --editgroup-description-override 'backfill of full file-level metadata for early-imported papers' file-meta -
+ # Counter({'total': 1000, 'update': 481, 'skip-existing-complete': 415, 'skip-no-match': 104, 'skip': 0, 'insert': 0, 'exists': 0})
+
diff --git a/notes/cleanups/scripts/file2ingestrequest.py b/notes/cleanups/scripts/file2ingestrequest.py
new file mode 100755
index 00000000..a005837f
--- /dev/null
+++ b/notes/cleanups/scripts/file2ingestrequest.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+from typing import Optional
+
+
+def transform(row: dict) -> Optional[dict]:
+    """Converts a fatcat file entity export row into a stub ingest request.
+
+    Returns None if the row should be filtered out.
+    """
+    # only consider PDFs (or files with unknown mimetype)
+    if row.get('mimetype') not in [None, 'application/pdf']:
+        return None
+    # skip deleted/redirected entities
+    if row.get('state') != 'active':
+        return None
+    # pick the first URL that is not an archive.org/wayback capture
+    base_url = None
+    for url in (row.get('urls') or []):
+        url = url['url']
+        if '://web.archive.org/' not in url and '://archive.org/' not in url:
+            base_url = url
+            break
+    if not base_url:
+        return None
+    if not row.get('sha1'):
+        return None
+    return dict(
+        base_url=base_url,
+        ingest_type="pdf",
+        link_source="fatcat",
+        link_source_id=f"file_{row['ident']}",
+        ingest_request_source="file-backfill",
+        ext_ids=dict(
+            sha1=row['sha1'],
+        ),
+    )
+
+
+def run():
+    for line in sys.stdin:
+        if not line.strip():
+            continue
+        row = json.loads(line)
+        request = transform(row)
+        if request:
+            print(json.dumps(request, sort_keys=True))
+
+
+if __name__ == "__main__":
+    run()