author    Bryan Newbold <bnewbold@archive.org>  2021-12-09 14:12:18 -0800
committer Bryan Newbold <bnewbold@archive.org>  2021-12-09 14:12:18 -0800
commit    e5c021bfeb03c50924160616dc64d44617d45933 (patch)
tree      ebae0cfe06c683f3945d2fb5255ca182b9244081
parent    89b5f51e57d3a0cc043640262e396e28297e7c00 (diff)
notes on re-GROBID-ing (and re-extracting) some files

-rw-r--r--  notes/tasks/2021-12-06_regrobid.md | 289
1 file changed, 289 insertions, 0 deletions
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
new file mode 100644
index 0000000..65e9fe3
--- /dev/null
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -0,0 +1,289 @@
+
+Want to test recent updates of GROBID (which fix a regex issue), and also
+re-process a number of PDFs which initially failed to process with GROBID.
+
+
+## HTTP 503
+
+These are attempts which failed because GROBID was too busy or not running.
+
+ # IMPROVED BELOW
+ COPY (
+ SELECT row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4749
+
+Not actually that many, which seems good. Confirm that these are unique by sha1hex:
+
+ cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
+ # 302
+
+Nope! Need to add `DISTINCT ON`:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ WHERE
+ grobid.status_code = 503
+ AND cdx.sha1hex IS NOT NULL
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+ WITH NULL '';
+ # COPY 4297
+
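+Sanity check that the new dump has no repeated sha1hex this time (should
+print 0, since the query now selects `DISTINCT ON (cdx.sha1hex)`):
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
+
+Then enqueue to Kafka:
+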
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json \
+    | rg -v "\\\\" \
+    | jq . -c \
+    | pv -l \
+    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## Never Processed CDX
+
+PDFs in fatcat which have never been processed with GROBID.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ grobid.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json'
+ WITH NULL '';
+ # COPY 15488
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json \
+    | rg -v "\\\\" \
+    | jq . -c \
+    | pv -l \
+    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
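+To spot-check that records are actually landing on the topic, a quick
+consumer one-liner (a sketch; `-o -5 -e` reads the last five messages per
+partition and exits):
+
+    kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -o -5 -e | jq .sha1hex -r
+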
+
+PDFs in fatcat which have never been processed with pdfextract.
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM fatcat_file
+ LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+ LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
+ LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+ WHERE
+ pdf_meta.sha1hex IS NULL
+ AND cdx.sha1hex IS NOT NULL
+ AND cdx.mimetype = 'application/pdf'
+ AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json'
+ WITH NULL '';
+ # COPY 45535
+
+ cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+ # 45.5k 0:00:01 [30.2k/s]
+
+## Timeout or Failure
+
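+For context, a quick breakdown of the failure modes (a sketch; per the
+status counts at the end of these notes, status_code 500 is `error` and
+-4 is `error-timeout`):
+
+    SELECT grobid.status_code, COUNT(*)
+    FROM grobid
+    WHERE grobid.status_code != 200
+    GROUP BY grobid.status_code
+    ORDER BY COUNT(*) DESC;
+
+Dump the 500s and timeouts to retry:
+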
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json'
+ WITH NULL '';
+ # COPY 8,084,296
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json \
+    | rg -v "\\\\" \
+    | jq . -c \
+    | pv -l \
+    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+This does not seem to be working very well: mostly errors, empty docs, etc.
+Will roll the Kafka consumer group forward after attempting a couple hundred
+thousand of these.
+
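+A sketch of that roll-forward, using stock Kafka tooling (the consumer group
+name is a placeholder for whatever group the GROBID workers actually use,
+and the broker port is assumed):
+
+    kafka-consumer-groups.sh --bootstrap-server wbgrp-svc263.us.archive.org:9092 \
+        --group SOME-GROBID-WORKER-GROUP \
+        --topic sandcrawler-prod.ungrobided-pg \
+        --reset-offsets --to-latest --dry-run
+    # re-run with --execute instead of --dry-run to apply
+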
+Let's try limiting to files actually in fatcat:
+
+ COPY (
+ SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+ FROM grobid
+ LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+ LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE
+ (grobid.status_code = 500 OR grobid.status_code = -4)
+ AND cdx.sha1hex IS NOT NULL
+ AND fatcat_file.sha1hex IS NOT NULL
+ AND file_meta.mimetype = 'application/pdf'
+        -- sort of arbitrary "not recently" date filter
+ AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+ -- LIMIT 5;
+ )
+ TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json'
+ WITH NULL '';
+ # COPY 529265
+
+That is a much more manageable batch to retry.
+
+ cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 529k 0:00:17 [31.0k/s]
+
+
+## Missing Fatcat Files
+
+There were around a half million fatcat file entities which didn't have `cdx`
+rows in sandcrawler. Did some specific pdfextract processing; now we should do
+GROBID ingest as well.
+
+Enqueue the `CDX` objects for GROBID and pdfextract processing:
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+ # 354k 0:00:11 [30.6k/s]
+
+ zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And some earlier files of interest on `aitio`:
+
+ cat files_missing_sha256.ingest_results.json \
+ | rg '"application/pdf"' \
+ | rg -v "\\\\" \
+ | jq .cdx -c \
+ | sort -u -S 4G \
+ | pv -l \
+ > files_missing_sha256.cdx.uniq.json
+ # 100k 0:00:47 [2.09k/s]
+
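+Note that `sort -u` dedupes whole JSON rows, not sha1hex, so the same caveat
+as the 503 dump above applies; worth a quick check:
+
+    cat files_missing_sha256.cdx.uniq.json | jq .sha1hex -r | sort | uniq -d | wc -l
+
+Then enqueue for both GROBID and pdfextract:
+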
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+ cat files_missing_sha256.cdx.uniq.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | pv -l \
+ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## General Counts
+
+How many fatcat files of what mimetype (reported in sandcrawler-db)?
+
+ SELECT file_meta.mimetype, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY file_meta.mimetype
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+----------
+ application/pdf | 45227033
+ | 433068
+ application/octet-stream | 30634
+ application/jats+xml | 6874
+ text/html | 876
+ application/postscript | 199
+ application/gzip | 173
+ text/plain | 84
+ application/xml | 48
+ application/vnd.ms-powerpoint | 38
+ application/msword | 16
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8
+ image/jpeg | 6
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 4
+ message/rfc822 | 4
+ application/zip | 4
+ text/x-tex | 3
+ application/x-dosexec | 3
+ application/x-tar | 2
+ application/vnd.ms-tnef | 2
+ image/svg+xml | 1
+ image/tiff | 1
+ image/png | 1
+ image/gif | 1
+ application/vnd.ms-office | 1
+ (25 rows)
+
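+The big blank row is files where `file_meta.mimetype` is NULL: either no
+`file_meta` row at all, or a row with no mimetype recorded. A sketch to
+count just the entirely-missing `file_meta` rows:
+
+    SELECT COUNT(*)
+    FROM fatcat_file
+    LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
+    WHERE
+        fatcat_file.first_release_ident IS NOT NULL
+        AND fatcat_file.any_url = true
+        AND content_scope IS NULL
+        AND file_meta.sha1hex IS NULL;
+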
+
+PDF extract status?
+
+ SELECT pdf_meta.status, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY pdf_meta.status
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | count
+ ----------------+----------
+ success | 43415920
+ | 2018522
+ text-too-large | 122730
+ parse-error | 94876
+ not-pdf | 32156
+ error-wayback | 14504
+ bad-unicode | 279
+ bad-pdf | 98
+ empty-blob | 2
+ (9 rows)
+
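+The `error-wayback` rows are presumably transient fetch failures; if they
+turn out to be worth retrying, a dump along the same lines as the GROBID
+ones above would work (a sketch, not run; output filename is hypothetical):
+
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM pdf_meta
+        LEFT JOIN cdx ON pdf_meta.sha1hex = cdx.sha1hex
+        WHERE
+            pdf_meta.status = 'error-wayback'
+            AND cdx.sha1hex IS NOT NULL
+    )
+    TO '/srv/sandcrawler/tasks/unextracted_fatcat.error_wayback.json'
+    WITH NULL '';
+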
+
+What are the GROBID status codes for fatcat files? Broken down by status and status code:
+
+ SELECT grobid.status, grobid.status_code, COUNT(*)
+ FROM fatcat_file
+ LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
+ WHERE
+ fatcat_file.first_release_ident IS NOT NULL
+ AND fatcat_file.any_url = true
+ AND content_scope IS NULL
+ GROUP BY grobid.status, grobid.status_code
+ ORDER BY COUNT(*) DESC
+ LIMIT 25;
+
+ status | status_code | count
+ ----------------+-------------+----------
+ success | 200 | 44409069
+ error | 500 | 580402
+ | | 468836
+ | 200 | 240660
+ error-timeout | -4 | 79
+ bad-grobid-xml | 200 | 38
+ error | 200 | 3
+ (7 rows)
+