Want to test recent updates of GROBID (to fix a regex issue), and also
re-process a number of PDFs which failed to process with GROBID initially.


## HTTP 503

These are attempts which failed because GROBID was too busy or not running.

    # IMPROVED BELOW
    COPY (
        SELECT row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        WHERE
            grobid.status_code = 503
            AND cdx.sha1hex IS NOT NULL
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
    WITH NULL '';
    # COPY 4749

Not actually that many, which seems good. Confirm that these are unique by sha1hex:

    cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
    # 302

Nope! A single sha1hex can have multiple CDX captures, so need to add "DISTINCT ON":

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        WHERE
            grobid.status_code = 503
            AND cdx.sha1hex IS NOT NULL
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json'
    WITH NULL '';
    # COPY 4297

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
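
Could also confirm the duplication on the database side with something like
the following (untested sketch, reusing the join above): count sha1hex values
in the 503 set which have more than one CDX capture.

    SELECT COUNT(*) FROM (
        -- sha1hex values with multiple CDX rows; these are the duplicates
        -- that DISTINCT ON collapses
        SELECT cdx.sha1hex
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        WHERE
            grobid.status_code = 503
            AND cdx.sha1hex IS NOT NULL
        GROUP BY cdx.sha1hex
        HAVING COUNT(*) > 1
    ) duped;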

## Never Processed CDX

PDFs in fatcat which have never been processed with GROBID:

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM fatcat_file
        LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
        LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
        LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
        WHERE
            grobid.sha1hex IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json'
    WITH NULL '';
    # COPY 15488

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

PDFs in fatcat which have never been processed with pdfextract:

    # TODO
    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM fatcat_file
        LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
        LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
        LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
        WHERE
            pdf_meta.sha1hex IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND cdx.mimetype = 'application/pdf'
            AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json'
    WITH NULL '';
    # COPY 45535

    cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
    # 45.5k 0:00:01 [30.2k/s]

## Timeout or Failure

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
        WHERE
            (grobid.status_code = 500 OR grobid.status_code = -4)
            AND cdx.sha1hex IS NOT NULL
            AND file_meta.mimetype = 'application/pdf'
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json'
    WITH NULL '';
    # COPY 8,084,296

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

This does not seem to be working very well: mostly errors, empty docs, etc.
Will roll-forward the kafka consumer group after attempting a couple hundred
thousand of these.

Let's try limiting to files actually in fatcat:

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE
            (grobid.status_code = 500 OR grobid.status_code = -4)
            AND cdx.sha1hex IS NOT NULL
            AND fatcat_file.sha1hex IS NOT NULL
            AND file_meta.mimetype = 'application/pdf'
            -- sort of arbitrary "not recently" date filter
            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
        -- LIMIT 5;
    )
    TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json'
    WITH NULL '';
    # COPY 529265

That is a much more manageable batch to retry.

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    # 529k 0:00:17 [31.0k/s]
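
Once the workers chew through these retries, could check the outcomes from the
database side with something like the following (untested sketch; assumes
`grobid.updated` gets refreshed when a file is re-processed):

    -- breakdown of GROBID results written since the retry batches were enqueued
    SELECT grobid.status, grobid.status_code, COUNT(*)
    FROM grobid
    WHERE grobid.updated >= '2021-12-08'
    GROUP BY grobid.status, grobid.status_code
    ORDER BY COUNT(*) DESC;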

## Missing Fatcat Files

There were around a half million fatcat file entities which didn't have `cdx`
rows in sandcrawler. Did some specific pdfextract processing; now we should do
GROBID ingest as well.

Enqueue the `CDX` objects for GROBID and pdfextract processing:

    zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    # 354k 0:00:11 [30.6k/s]

    zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

And some earlier files of interest on `aitio`:

    cat files_missing_sha256.ingest_results.json \
        | rg '"application/pdf"' \
        | rg -v "\\\\" \
        | jq .cdx -c \
        | sort -u -S 4G \
        | pv -l \
        > files_missing_sha256.cdx.uniq.json
    # 100k 0:00:47 [2.09k/s]

    cat files_missing_sha256.cdx.uniq.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

    cat files_missing_sha256.cdx.uniq.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

## General Counts

How many fatcat files are there of each mimetype (as reported in sandcrawler-db)?

    SELECT file_meta.mimetype, COUNT(*)
    FROM fatcat_file
    LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
    GROUP BY file_meta.mimetype
    ORDER BY COUNT(*) DESC
    LIMIT 25;

                                     mimetype                                  |  count
    ---------------------------------------------------------------------------+----------
     application/pdf                                                           | 45227033
                                                                               |   433068
     application/octet-stream                                                  |    30634
     application/jats+xml                                                      |     6874
     text/html                                                                 |      876
     application/postscript                                                    |      199
     application/gzip                                                          |      173
     text/plain                                                                |       84
     application/xml                                                           |       48
     application/vnd.ms-powerpoint                                             |       38
     application/msword                                                        |       16
     application/vnd.openxmlformats-officedocument.wordprocessingml.document   |        8
     image/jpeg                                                                |        6
     application/vnd.openxmlformats-officedocument.presentationml.presentation |        4
     message/rfc822                                                            |        4
     application/zip                                                           |        4
     text/x-tex                                                                |        3
     application/x-dosexec                                                     |        3
     application/x-tar                                                         |        2
     application/vnd.ms-tnef                                                   |        2
     image/svg+xml                                                             |        1
     image/tiff                                                                |        1
     image/png                                                                 |        1
     image/gif                                                                 |        1
     application/vnd.ms-office                                                 |        1
    (25 rows)
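
The blank mimetype row (433,068 files) lumps together files with no `file_meta`
row at all and files whose `file_meta.mimetype` is NULL; the former presumably
still need metadata backfill. Something like this (untested sketch) would split
the two cases apart:

    -- TRUE means no file_meta row exists for the fatcat file at all
    SELECT (file_meta.sha1hex IS NULL) AS no_file_meta_row, COUNT(*)
    FROM fatcat_file
    LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
        AND file_meta.mimetype IS NULL
    GROUP BY 1;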

PDF extract status?

    SELECT pdf_meta.status, COUNT(*)
    FROM fatcat_file
    LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
    GROUP BY pdf_meta.status
    ORDER BY COUNT(*) DESC
    LIMIT 25;

         status     |  count
    ----------------+----------
     success        | 43415920
                    |  2018522
     text-too-large |   122730
     parse-error    |    94876
     not-pdf        |    32156
     error-wayback  |    14504
     bad-unicode    |      279
     bad-pdf        |       98
     empty-blob     |        2
    (9 rows)

What are the GROBID status codes for fatcat files? Narrowed down:

    SELECT grobid.status, grobid.status_code, COUNT(*)
    FROM fatcat_file
    LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
    GROUP BY grobid.status, grobid.status_code
    ORDER BY COUNT(*) DESC
    LIMIT 25;

         status     | status_code |  count
    ----------------+-------------+----------
     success        |         200 | 44409069
     error          |         500 |   580402
                    |             |   468836
                    |         200 |   240660
     error-timeout  |          -4 |       79
     bad-grobid-xml |         200 |       38
     error          |         200 |        3
    (7 rows)
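
A single number worth tracking as these retries and backfills land could be the
count of fatcat PDFs still lacking a successful GROBID result; untested sketch,
reusing the joins and filters above:

    -- fatcat PDFs with no GROBID row at all, or a non-200 GROBID result
    SELECT COUNT(*)
    FROM fatcat_file
    LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
    LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
        AND file_meta.mimetype = 'application/pdf'
        AND (grobid.status_code IS NULL OR grobid.status_code != 200);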
