From 8cc3cebd2392d16026214f5e92b99a322ef2e044 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 2 Sep 2020 16:10:13 -0700 Subject: follow-up notes on processing 'holes' --- notes/tasks/2020-07-22_processing_holes.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'notes') diff --git a/notes/tasks/2020-07-22_processing_holes.md b/notes/tasks/2020-07-22_processing_holes.md index 363989a..70e2b59 100644 --- a/notes/tasks/2020-07-22_processing_holes.md +++ b/notes/tasks/2020-07-22_processing_holes.md @@ -18,6 +18,11 @@ Full batch: cat dump_unextracted_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 +Re-ran on 2020-08-19: + + wc -l dump_unextracted_pdf_petabox.2020-08-19.json + 971194 dump_unextracted_pdf_petabox.2020-08-19.json + ## `pdf_meta` missing CDX rows First, the GROBID-ized rows, but only if they have a fatcat file as well. @@ -26,6 +31,13 @@ First, the GROBID-ized rows, but only if they have a fatcat file as well. cat dump_unextracted_pdf.fatcat.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 +Re-ran on 2020-08-19: + + wc -l dump_unextracted_pdf.fatcat.2020-08-19.json + 65517 dump_unextracted_pdf.fatcat.2020-08-19.json + +Enqueued! + ## `GROBID` missing petabox rows wc -l /grande/snapshots/dump_ungrobided_pdf_petabox.2020-07-22.json @@ -39,6 +51,13 @@ Full batch: cat dump_ungrobided_pdf_petabox.2020-07-22.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 +Re-ran on 2020-08-19: + + wc -l dump_ungrobided_pdf_petabox.2020-08-19.json + 933 dump_ungrobided_pdf_petabox.2020-08-19.json + +Enqueued! + ## `GROBID` for missing CDX rows in fatcat wc -l dump_ungrobided_pdf.fatcat.2020-07-22.json -- cgit v1.2.3