Want to test recent updates of GROBID (to fix regex issue), and also re-process
a number of PDFs which failed to process with GROBID initially.

## HTTP 503

These are attempts which failed because GROBID was too busy or not running.

    # IMPROVED BELOW
    COPY (
        SELECT row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        WHERE
            grobid.status_code = 503
            AND cdx.sha1hex IS NOT NULL
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' WITH NULL '';
    # COPY 4749

Not actually that many, which seems good. Confirm that these are uniq by sha1hex:

    cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l
    # 302

Nope! Need to add "distinct on":

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        WHERE
            grobid.status_code = 503
            AND cdx.sha1hex IS NOT NULL
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' WITH NULL '';
    # COPY 4297

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

## Never Processed CDX

PDFs in fatcat which have never been processed with GROBID.

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM fatcat_file
        LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
        LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex
        LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
        WHERE
            grobid.sha1hex IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json' WITH NULL '';
    # COPY 15488

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

PDFs in fatcat which have never been processed with pdfextract.

    # TODO
    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM fatcat_file
        LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex
        LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex
        LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex
        WHERE
            pdf_meta.sha1hex IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND cdx.mimetype = 'application/pdf'
            AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL)
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json' WITH NULL '';
    # COPY 45535

    cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1
    # 45.5k 0:00:01 [30.2k/s]

## Timeout or Failure

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
        WHERE
            (grobid.status_code = 500 OR grobid.status_code = -4)
            AND cdx.sha1hex IS NOT NULL
            AND file_meta.mimetype = 'application/pdf'
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json' WITH NULL '';
    # COPY 8,084,296

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

This seems to not be working very well: mostly errors, empty docs, etc.
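To get a quick sense of what these failures look like as they come back, one option is to peek at the tail of the GROBID results topic. A minimal sketch, assuming the output topic is named `sandcrawler-prod.grobid-output-pg` and that result messages carry `.status` and `.status_code` fields (neither confirmed above):

    # sketch: consume the last ~10 messages per partition and show only the status fields;
    # the topic name and JSON field names are assumptions, not taken from the commands above
    kafkacat -C -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.grobid-output-pg -o -10 -e \
        | jq -c '{status: .status, status_code: .status_code}'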
Will roll-forward the kafka consumer group after attempting a couple hundred
thousand of these.

Let's try limiting to files actually in fatcat:

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE
            (grobid.status_code = 500 OR grobid.status_code = -4)
            AND cdx.sha1hex IS NOT NULL
            AND fatcat_file.sha1hex IS NOT NULL
            AND file_meta.mimetype = 'application/pdf'
            -- sort of arbitrary "not recently" date filter
            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json' WITH NULL '';
    # COPY 529265

That is a much more manageable batch to retry.

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    # 529k 0:00:17 [31.0k/s]

## Missing Fatcat Files

There were around a half million fatcat file entities which didn't have `cdx`
rows in sandcrawler. Did some specific pdfextract processing; now we should do
GROBID ingest as well.

Enqueue the `CDX` objects for GROBID and pdfextract processing:

    zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    # 354k 0:00:11 [30.6k/s]

    zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

And some earlier files of interest on `aitio`:

    cat files_missing_sha256.ingest_results.json \
        | rg '"application/pdf"' \
        | rg -v "\\\\" \
        | jq .cdx -c \
        | sort -u -S 4G \
        | pv -l \
        > files_missing_sha256.cdx.uniq.json
    # 100k 0:00:47 [2.09k/s]

    cat files_missing_sha256.cdx.uniq.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

    cat files_missing_sha256.cdx.uniq.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1

## Ancient Fatcat Files

Files from an era where we didn't record GROBID version or status, even for
success.

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE
            grobid.status_code = 200
            AND grobid.status IS NULL
            AND cdx.sha1hex IS NOT NULL
            AND fatcat_file.sha1hex IS NOT NULL
            -- sort of arbitrary "not recently" date filter
            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' WITH NULL '';

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1
    # 107k 0:00:03 [29.9k/s]
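Before queueing the much larger re-processing backlog in the next section, it may be worth checking how far the GROBID workers have caught up on the `sandcrawler-prod.ungrobided-pg` topic. A rough sketch with the stock Kafka CLI tooling; the consumer group name (`grobid-extract`) and the broker port are guesses, not confirmed here:

    # sketch: show per-partition offsets and lag for the GROBID worker consumer group;
    # the group name "grobid-extract" and port 9092 are assumptions
    kafka-consumer-groups.sh \
        --bootstrap-server wbgrp-svc263.us.archive.org:9092 \
        --describe --group grobid-extract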
## Start Re-Processing Old GROBID Versions

    COPY (
        SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx)
        FROM grobid
        LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex
        LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
        WHERE
            grobid.status = 'success'
            AND grobid.grobid_version NOT LIKE '0.7.%'
            AND cdx.sha1hex IS NOT NULL
            AND fatcat_file.sha1hex IS NOT NULL
            -- sort of arbitrary "not recently" date filter
            AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15')
        -- LIMIT 5;
    ) TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' WITH NULL '';

This one is huge, and we want to process it in batches/chunks of ~8 million at a time:

    cd /srv/sandcrawler/tasks/
    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \
        | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json

Submit individual batches like:

    cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \
        | rg -v "\\\\" \
        | jq . -c \
        | pv -l \
        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1

Overall progress:

    x  ungrobided_fatcat.2021-12-11.grobid_old.split_00.json
    x  ungrobided_fatcat.2021-12-11.grobid_old.split_01.json
    x  ungrobided_fatcat.2021-12-11.grobid_old.split_02.json
    x  ungrobided_fatcat.2021-12-11.grobid_old.split_03.json
    .  ungrobided_fatcat.2021-12-11.grobid_old.split_04.json
    => ungrobided_fatcat.2021-12-11.grobid_old.split_05.json
    => ungrobided_fatcat.2021-12-11.grobid_old.split_06.json
    => ungrobided_fatcat.2021-12-11.grobid_old.split_07.json
    => ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small)

## General Counts

How many fatcat files of what mimetype (reported in sandcrawler-db)?

    SELECT file_meta.mimetype, COUNT(*)
    FROM fatcat_file
    LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
    GROUP BY file_meta.mimetype
    ORDER BY COUNT(*) DESC
    LIMIT 25;

                                     mimetype                                   |  count
    ----------------------------------------------------------------------------+----------
     application/pdf                                                            | 45227033
                                                                                |   433068
     application/octet-stream                                                   |    30634
     application/jats+xml                                                       |     6874
     text/html                                                                  |      876
     application/postscript                                                     |      199
     application/gzip                                                           |      173
     text/plain                                                                 |       84
     application/xml                                                            |       48
     application/vnd.ms-powerpoint                                              |       38
     application/msword                                                         |       16
     application/vnd.openxmlformats-officedocument.wordprocessingml.document    |        8
     image/jpeg                                                                 |        6
     application/vnd.openxmlformats-officedocument.presentationml.presentation  |        4
     message/rfc822                                                             |        4
     application/zip                                                            |        4
     text/x-tex                                                                 |        3
     application/x-dosexec                                                      |        3
     application/x-tar                                                          |        2
     application/vnd.ms-tnef                                                    |        2
     image/svg+xml                                                              |        1
     image/tiff                                                                 |        1
     image/png                                                                  |        1
     image/gif                                                                  |        1
     application/vnd.ms-office                                                  |        1
    (25 rows)

PDF extract status?

    SELECT pdf_meta.status, COUNT(*)
    FROM fatcat_file
    LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
    GROUP BY pdf_meta.status
    ORDER BY COUNT(*) DESC
    LIMIT 25;

         status     |  count
    ----------------+----------
     success        | 43415920
                    |  2018522
     text-too-large |   122730
     parse-error    |    94876
     not-pdf        |    32156
     error-wayback  |    14504
     bad-unicode    |      279
     bad-pdf        |       98
     empty-blob     |        2
    (9 rows)

What are the GROBID status codes for fatcat files?
Narrowed down:

    SELECT grobid.status, grobid.status_code, COUNT(*)
    FROM fatcat_file
    LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
    WHERE
        fatcat_file.first_release_ident IS NOT NULL
        AND fatcat_file.any_url = true
        AND content_scope IS NULL
    GROUP BY grobid.status, grobid.status_code
    ORDER BY COUNT(*) DESC
    LIMIT 25;

         status     | status_code |  count
    ----------------+-------------+----------
     success        |         200 | 44409069
     error          |         500 |   580402
                    |             |   468836
                    |         200 |   240660
     error-timeout  |          -4 |       79
     bad-grobid-xml |         200 |       38
     error          |         200 |        3
    (7 rows)

Ran the same query again on 2021-12-15:

         status     | status_code |  count
    ----------------+-------------+----------
     success        |         200 | 45092915
     error          |         500 |   302373
                    |             |   250335
                    |         200 |    53352
     bad-grobid-xml |         200 |       39
     error-timeout  |          -4 |       37
     error          |         200 |       34
     error          |         503 |        2
    (8 rows)
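Since the plan is to compare these counts again as the backlog drains, it might be convenient to capture dated snapshots of this query rather than re-pasting by hand. A small sketch; the database name (`sandcrawler`) passed to `psql` is an assumption:

    # sketch: save a dated snapshot of the GROBID status counts for later comparison;
    # the "sandcrawler" database name is a guess, not confirmed above
    psql sandcrawler -c "
        SELECT grobid.status, grobid.status_code, COUNT(*)
        FROM fatcat_file
        LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex
        WHERE fatcat_file.first_release_ident IS NOT NULL
            AND fatcat_file.any_url = true
            AND content_scope IS NULL
        GROUP BY grobid.status, grobid.status_code
        ORDER BY COUNT(*) DESC;
    " >> /srv/sandcrawler/tasks/grobid_status_counts.$(date -I).txt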