notes on re-GROBID-ing (and re-extracting) some files

author: Bryan Newbold <bnewbold@archive.org> 2021-12-09 14:12:18 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-12-09 14:12:18 -0800
commit: e5c021bfeb03c50924160616dc64d44617d45933 (patch)
tree: ebae0cfe06c683f3945d2fb5255ca182b9244081 /notes
parent: 89b5f51e57d3a0cc043640262e396e28297e7c00 (diff)
download: sandcrawler-trawler.tar.gz
sandcrawler-trawler.zip
1 files changed, 289 insertions, 0 deletions
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md
new file mode 100644
index 0000000..65e9fe3
--- /dev/null
+++ b/notes/tasks/2021-12-06_regrobid.md
@@ -0,0 +1,289 @@
+
+Want to test recent updates of GROBID (to fix regex issue), and also re-process
+a number of PDFs which failed to process with GROBID initially.
+
+
+## HTTP 503
+
+These are attempts which failed because GROBID was too busy or not running.
+
+    # IMPROVED BELOW
+    COPY (
+        SELECT row_to_json(cdx)
+        FROM grobid
+        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+        WHERE
+            grobid.status_code = 503
+            AND cdx.sha1hex IS NOT NULL
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+    WITH NULL '';
+    # COPY 4749
+
+Not actually that many, which seems good. Confirm that these are unique by sha1hex:
+
+    cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
+    # 302
+
+Nope! Need to add "distinct on":
+
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM grobid
+        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+        WHERE
+            grobid.status_code = 503
+            AND cdx.sha1hex IS NOT NULL
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
+    WITH NULL '';
+    # COPY 4297
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+## Never Processed CDX
+
+PDFs in fatcat which have never been processed with GROBID.
+
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM fatcat_file
+        LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+        LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
+        LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+        WHERE
+            grobid.sha1hex IS NULL
+            AND cdx.sha1hex IS NOT NULL
+            AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json'
+    WITH NULL '';
+    # COPY 15488
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+
+PDFs in fatcat which have never been processed with pdfextract.
+
+    # TODO
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM fatcat_file
+        LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
+        LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
+        LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
+        WHERE
+            pdf_meta.sha1hex IS NULL
+            AND cdx.sha1hex IS NOT NULL
+            AND cdx.mimetype = 'application/pdf'
+            AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json'
+    WITH NULL '';
+    # COPY 45535
+
+    cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+    # 45.5k 0:00:01 [30.2k/s]
+
+## Timeout or Failure
+
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM grobid
+        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+        LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+        WHERE
+            (grobid.status_code = 500 OR grobid.status_code = -4)
+            AND cdx.sha1hex IS NOT NULL
+            AND file_meta.mimetype  = 'application/pdf'
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json'
+    WITH NULL '';
+    # COPY 8,084,296
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+This seems to not be working very well, mostly errors, empty docs, etc. Will
+roll-forward the kafka consumer group after attempting a couple hundred
+thousand of these.
+
+Let's try limiting to files actually in fatcat:
+
+    COPY (
+        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
+        FROM grobid
+        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
+        LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
+        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+        WHERE
+            (grobid.status_code = 500 OR grobid.status_code = -4)
+            AND cdx.sha1hex IS NOT NULL
+            AND fatcat_file.sha1hex IS NOT NULL
+            AND file_meta.mimetype  = 'application/pdf'
+            -- sort of arbitrary "not recently" date filter
+            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
+        -- LIMIT 5;
+    )
+    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json'
+    WITH NULL '';
+    # COPY 529265
+
+That is a much more manageable batch to retry.
+
+    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+    # 529k 0:00:17 [31.0k/s]
+
+
+## Missing Fatcat Files
+
+There were around a half million fatcat file entities which didn't have `cdx`
+rows in sandcrawler. Did some specific pdfextract processing; now we should do
+GROBID ingest as well.
+
+Enqueue the `CDX` objects for GROBID and pdfextract processing:
+
+    zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+    # 354k 0:00:11 [30.6k/s]
+
+    zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+And some earlier files of interest on `aitio`:
+
+    cat files_missing_sha256.ingest_results.json \
+        | rg '"application/pdf"' \
+        | rg -v "\\\\" \
+        | jq .cdx -c \
+        | sort -u -S 4G \
+        | pv -l \
+        > files_missing_sha256.cdx.uniq.json
+    # 100k 0:00:47 [2.09k/s]
+
+    cat files_missing_sha256.cdx.uniq.json \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
+
+    cat files_missing_sha256.cdx.uniq.json \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
+
+## General Counts
+
+How many fatcat files of what mimetype (reported in sandcrawler-db)?
+
+    SELECT file_meta.mimetype, COUNT(*)
+    FROM fatcat_file
+    LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
+    WHERE
+        fatcat_file.first_release_ident IS NOT NULL
+        AND fatcat_file.any_url = true
+        AND content_scope IS NULL
+    GROUP BY file_meta.mimetype
+    ORDER BY COUNT(*) DESC
+    LIMIT 25;
+
+                                     mimetype                                  |  count
+    ---------------------------------------------------------------------------+----------
+     application/pdf                                                           | 45227033
+                                                                               |   433068
+     application/octet-stream                                                  |    30634
+     application/jats+xml                                                      |     6874
+     text/html                                                                 |      876
+     application/postscript                                                    |      199
+     application/gzip                                                          |      173
+     text/plain                                                                |       84
+     application/xml                                                           |       48
+     application/vnd.ms-powerpoint                                             |       38
+     application/msword                                                        |       16
+     application/vnd.openxmlformats-officedocument.wordprocessingml.document   |        8
+     image/jpeg                                                                |        6
+     application/vnd.openxmlformats-officedocument.presentationml.presentation |        4
+     message/rfc822                                                            |        4
+     application/zip                                                           |        4
+     text/x-tex                                                                |        3
+     application/x-dosexec                                                     |        3
+     application/x-tar                                                         |        2
+     application/vnd.ms-tnef                                                   |        2
+     image/svg+xml                                                             |        1
+     image/tiff                                                                |        1
+     image/png                                                                 |        1
+     image/gif                                                                 |        1
+     application/vnd.ms-office                                                 |        1
+    (25 rows)
+
+
+PDF extract status?
+
+    SELECT pdf_meta.status, COUNT(*)
+    FROM fatcat_file
+    LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
+    WHERE
+        fatcat_file.first_release_ident IS NOT NULL
+        AND fatcat_file.any_url = true
+        AND content_scope IS NULL
+    GROUP BY pdf_meta.status
+    ORDER BY COUNT(*) DESC
+    LIMIT 25;
+
+         status     |  count
+    ----------------+----------
+     success        | 43415920
+                    |  2018522
+     text-too-large |   122730
+     parse-error    |    94876
+     not-pdf        |    32156
+     error-wayback  |    14504
+     bad-unicode    |      279
+     bad-pdf        |       98
+     empty-blob     |        2
+    (9 rows)
+
+
+What are the GROBID status codes for fatcat files? Narrowed down:
+
+    SELECT grobid.status, grobid.status_code, COUNT(*)
+    FROM fatcat_file
+    LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
+    WHERE
+        fatcat_file.first_release_ident IS NOT NULL
+        AND fatcat_file.any_url = true
+        AND content_scope IS NULL
+    GROUP BY grobid.status, grobid.status_code
+    ORDER BY COUNT(*) DESC
+    LIMIT 25;
+
+         status     | status_code |  count
+    ----------------+-------------+----------
+     success        |         200 | 44409069
+     error          |         500 |   580402
+                    |             |   468836
+                    |         200 |   240660
+     error-timeout  |          -4 |       79
+     bad-grobid-xml |         200 |       38
+     error          |         200 |        3
+    (7 rows)
+
author	Bryan Newbold <bnewbold@archive.org>	2021-12-09 14:12:18 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2021-12-09 14:12:18 -0800
commit	e5c021bfeb03c50924160616dc64d44617d45933 (patch)
tree	ebae0cfe06c683f3945d2fb5255ca182b9244081 /notes
parent	89b5f51e57d3a0cc043640262e396e28297e7c00 (diff)
download	sandcrawler-trawler.tar.gz sandcrawler-trawler.zip