From e5c021bfeb03c50924160616dc64d44617d45933 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 9 Dec 2021 14:12:18 -0800 Subject: notes on re-GROBID-ing (and re-extracting) some files --- notes/tasks/2021-12-06_regrobid.md | 289 +++++++++++++++++++++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 notes/tasks/2021-12-06_regrobid.md diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md new file mode 100644 index 0000000..65e9fe3 --- /dev/null +++ b/notes/tasks/2021-12-06_regrobid.md @@ -0,0 +1,289 @@ + +Want to test recent updates of GROBID (to fix regex issue), and also re-process +a number of PDFs which failed to process with GROBID initially. + + +## HTTP 503 + +These are attempts which failed because GROBID was too busy or not running. + + # IMPROVED BELOW + COPY ( + SELECT row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4749 + +Not actually that many, which seems good. Confirm that these are uniq by sha1hex: + + cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l + # 302 + +Nope! Need to add "distinct on": + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4297 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +## Never Processed CDX + +PDFs in fatcat which have never been processed with GROBID. 
+ + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json' + WITH NULL ''; + # COPY 15488 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + +PDFs in fatcat which have never been processed with pdfextract. + + # TODO + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + pdf_meta.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.mimetype = 'application/pdf' + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json' + WITH NULL ''; + # COPY 45535 + + cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + # 45.5k 0:00:01 [30.2k/s] + +## Timeout or Failure + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json' + WITH NULL ''; + # COPY 8,084,296 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +This seems to not be working very well, mostly errors, empty docs, etc. Will +roll-forward the kafka consumer group after attempting a couple hundred +thousand of these. + +Let's try limiting to files actually in fatcat: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- sort of arbitrary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json' + WITH NULL ''; + # COPY 529265 + +That is a much more manageable batch to retry. + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 529k 0:00:17 [31.0k/s] + + +## Missing Fatcat Files + +There were around a half million fatcat file entities which didn't have `cdx` +rows in sandcrawler. Did some specific pdfextract processing; now we should do +GROBID ingest as well. + +Enqueue the `CDX` objects for GROBID and pdfextract processing: + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 354k 0:00:11 [30.6k/s] + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And some earlier files of interest on `aitio`: + + cat files_missing_sha256.ingest_results.json \ + | rg '"application/pdf"' \ + | rg -v "\\\\" \ + | jq .cdx -c \ + | sort -u -S 4G \ + | pv -l \ + > files_missing_sha256.cdx.uniq.json + # 100k 0:00:47 [2.09k/s] + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## General Counts + +How many fatcat files of what mimetype (reported in sandcrawler-db)? 
+ + SELECT file_meta.mimetype, COUNT(*) + FROM fatcat_file + LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY file_meta.mimetype + ORDER BY COUNT(*) DESC + LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------+---------- + application/pdf | 45227033 + | 433068 + application/octet-stream | 30634 + application/jats+xml | 6874 + text/html | 876 + application/postscript | 199 + application/gzip | 173 + text/plain | 84 + application/xml | 48 + application/vnd.ms-powerpoint | 38 + application/msword | 16 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8 + image/jpeg | 6 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 4 + message/rfc822 | 4 + application/zip | 4 + text/x-tex | 3 + application/x-dosexec | 3 + application/x-tar | 2 + application/vnd.ms-tnef | 2 + image/svg+xml | 1 + image/tiff | 1 + image/png | 1 + image/gif | 1 + application/vnd.ms-office | 1 + (25 rows) + + +PDF extract status? + + SELECT pdf_meta.status, COUNT(*) + FROM fatcat_file + LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY pdf_meta.status + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | count + ----------------+---------- + success | 43415920 + | 2018522 + text-too-large | 122730 + parse-error | 94876 + not-pdf | 32156 + error-wayback | 14504 + bad-unicode | 279 + bad-pdf | 98 + empty-blob | 2 + (9 rows) + + +What are the GROBID status codes for fatcat files? 
Narrowed down: + + SELECT grobid.status, grobid.status_code, COUNT(*) + FROM fatcat_file + LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY grobid.status, grobid.status_code + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 44409069 + error | 500 | 580402 + | | 468836 + | 200 | 240660 + error-timeout | -4 | 79 + bad-grobid-xml | 200 | 38 + error | 200 | 3 + (7 rows) + -- cgit v1.2.3