Quick notes on how daily ingest is going, circa September/October 2020.


    -- Per-day counts for 'pdf' ingest requests from the 'fatcat-changelog'
    -- source created within the last month. `total` counts joined
    -- (result, request) rows; `success` counts those whose result status is
    -- 'success'.
    SELECT
        req.ingest_type,
        date(req.created),
        COUNT(*) AS total,
        COUNT(*) FILTER (WHERE res.status = 'success') AS success
    FROM ingest_file_result AS res
    -- INNER JOIN: every WHERE predicate below tests an ingest_request column,
    -- so result rows without a matching request were already discarded when
    -- this was written as a LEFT JOIN.
    INNER JOIN ingest_request AS req
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    WHERE req.created >= NOW() - '1 month'::INTERVAL
        AND req.ingest_type = 'pdf'
        AND req.ingest_request_source = 'fatcat-changelog'
    -- res.ingest_type always equals req.ingest_type (join condition), so it
    -- does not need to be a separate grouping column.
    GROUP BY req.ingest_type, date(req.created)
    ORDER BY date(req.created) DESC;

     ingest_type |    date    | total | success
    -------------+------------+-------+---------
     pdf         | 2020-10-10 |  6145 |    1368
     pdf         | 2020-10-09 | 28453 |    6461
     pdf         | 2020-10-08 | 15105 |    3803
     pdf         | 2020-10-07 | 34213 |   10813
     pdf         | 2020-10-06 | 22263 |    8565
     pdf         | 2020-10-05 |  7910 |    3200
     pdf         | 2020-10-04 | 10865 |    4579
     pdf         | 2020-10-03 | 27745 |   10818
     pdf         | 2020-10-02 | 34320 |   13523
     pdf         | 2020-10-01 | 32548 |   13252
     pdf         | 2020-09-30 | 34798 |   14113
     pdf         | 2020-09-29 | 22463 |    8328
     pdf         | 2020-09-28 |  4117 |    1278
     pdf         | 2020-09-27 |  5894 |    1732
     pdf         | 2020-09-26 | 34949 |   13901
     pdf         | 2020-09-25 | 33680 |   10605
     pdf         | 2020-09-24 | 15125 |    5785
     pdf         | 2020-09-23 | 20866 |    6584
     pdf         | 2020-09-22 | 20949 |    7167
     pdf         | 2020-09-21 | 22483 |    7308
     pdf         | 2020-09-20 | 45644 |   16981
     pdf         | 2020-09-19 | 95571 |   31991
     pdf         | 2020-09-18 | 50849 |   15875
     pdf         | 2020-09-17 | 20121 |    3158
     pdf         | 2020-09-16 | 39184 |   12150
     pdf         | 2020-09-15 | 16986 |    7705
    (26 rows)


    -- The 20 most frequent result statuses for 'pdf' ingest requests from the
    -- 'fatcat-changelog' source created within the last 30 days.
    SELECT
        res.ingest_type,
        res.status,
        -- Explicit alias so ORDER BY does not depend on the implicit default
        -- name of an unaliased aggregate column.
        COUNT(*) AS count
    FROM ingest_file_result AS res
    -- INNER JOIN: the WHERE clause filters on ingest_request columns, so
    -- unmatched result rows were never retained by the former LEFT JOIN.
    INNER JOIN ingest_request AS req
        ON res.ingest_type = req.ingest_type
        AND res.base_url = req.base_url
    WHERE req.created >= NOW() - '30 day'::INTERVAL
        AND req.ingest_type = 'pdf'
        AND req.ingest_request_source = 'fatcat-changelog'
    GROUP BY res.ingest_type, res.status
    -- No tie-breaker: statuses with equal counts come back in arbitrary order.
    ORDER BY count DESC
    LIMIT 20;

     ingest_type |               status                | count
    -------------+-------------------------------------+--------
     pdf         | success                             | 241047
     pdf         | no-pdf-link                         | 143084
     pdf         | spn2-cdx-lookup-failure             | 108311
     pdf         | gateway-timeout                     |  97250
     pdf         | cdx-error                           |  61820
     pdf         | link-loop                           |  31350
     pdf         | wayback-error                       |   9139
     pdf         | spn2-error:job-failed               |   4240
     pdf         | spn2-error                          |   3893
     pdf         | wrong-mimetype                      |   1010
     pdf         | no-capture                          |    851
     pdf         | null-body                           |    605
     pdf         | redirect-loop                       |    261
     pdf         | spn2-error:soft-time-limit-exceeded |    126
     pdf         | terminal-bad-status                 |    120
     pdf         | petabox-error                       |    105
     pdf         | timeout                             |     29
     pdf         | spn2-error:no-status                |      2
     pdf         | spn2-error:invalid-server-response  |      2
     pdf         | bad-gzip-encoding                   |      1
    (20 rows)

    -- Top 25 (domain, status) pairs among NON-success ingest results, for 'pdf'
    -- requests from the 'fatcat-changelog' source, restricted to results with
    -- ingest_file_result.updated in the last 30 days.
    SELECT
        domain,
        status,
        COUNT(*) AS count
    FROM (
        SELECT
            res.status,
            -- Captures the text between '://' and the next '/' of the
            -- terminal URL; NULL when the URL is NULL or does not match.
            substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result AS res
        -- INNER JOIN: the WHERE clause filters on ingest_request columns, so
        -- unmatched result rows were never retained by the former LEFT JOIN.
        INNER JOIN ingest_request AS req
            ON res.ingest_type = req.ingest_type
            AND res.base_url = req.base_url
        WHERE
            -- Time window is on the result row, not the request row.
            -- Alternative filter (disabled):
            --   ingest_request.created >= NOW() - '3 day'::INTERVAL
            res.updated >= NOW() - '30 day'::INTERVAL
            AND req.ingest_type = 'pdf'
            AND req.ingest_request_source = 'fatcat-changelog'
    ) AS by_domain
    -- `!= ''` also drops rows whose domain is NULL (the comparison is unknown).
    WHERE by_domain.domain != ''
        AND by_domain.status != 'success'
    GROUP BY domain, status
    ORDER BY count DESC
    LIMIT 25;


                domain            |         status          | count
    ------------------------------+-------------------------+-------
     zenodo.org                   | no-pdf-link             | 52767
     www.degruyter.com            | link-loop               | 17666
     www.degruyter.com            | spn2-cdx-lookup-failure | 17597
     ieeexplore.ieee.org          | gateway-timeout         | 15290
     www.sciencedirect.com        | no-pdf-link             | 14043
     apps.crossref.org            | no-pdf-link             | 11531
     figshare.com                 | no-pdf-link             |  8966
     tandf.figshare.com           | no-pdf-link             |  7276
     zenodo.org                   | no-capture              |  7191
     springernature.figshare.com  | no-pdf-link             |  6485
     www.taylorfrancis.com        | link-loop               |  6266
     www.persee.fr                | terminal-bad-status     |  6031
     journals.openedition.org     | gateway-timeout         |  5639
     www.cairn.info               | link-loop               |  5618
     archaeologydataservice.ac.uk | no-pdf-link             |  5359
     www.taylorfrancis.com        | spn2-cdx-lookup-failure |  4748
     www.e-periodica.ch           | no-pdf-link             |  4722
     osf.io                       | no-capture              |  4247
     cancerres.aacrjournals.org   | no-pdf-link             |  4136
     dlc.library.columbia.edu     | no-pdf-link             |  4085
     www.egms.de                  | no-pdf-link             |  3304
     journals.lww.com             | no-pdf-link             |  3218
     journals.plos.org            | no-pdf-link             |  3005
     linkinghub.elsevier.com      | gateway-timeout         |  2833
     www.egms.de                  | redirect-loop           |  2606
    (25 rows)

    -- Top 25 domains by number of SUCCESSFUL ingest results, for 'pdf' requests
    -- from the 'fatcat-changelog' source, restricted to results with
    -- ingest_file_result.updated in the last 30 days.
    SELECT
        domain,
        status,
        COUNT(*) AS count
    FROM (
        SELECT
            res.status,
            -- Captures the text between '://' and the next '/' of the
            -- terminal URL; NULL when the URL is NULL or does not match.
            substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result AS res
        -- INNER JOIN: the WHERE clause filters on ingest_request columns, so
        -- unmatched result rows were never retained by the former LEFT JOIN.
        INNER JOIN ingest_request AS req
            ON res.ingest_type = req.ingest_type
            AND res.base_url = req.base_url
        WHERE
            -- Time window is on the result row, not the request row.
            -- Alternative filter (disabled):
            --   ingest_request.created >= NOW() - '3 day'::INTERVAL
            res.updated >= NOW() - '30 day'::INTERVAL
            AND req.ingest_type = 'pdf'
            AND req.ingest_request_source = 'fatcat-changelog'
    ) AS by_domain
    -- `!= ''` also drops rows whose domain is NULL (the comparison is unknown).
    WHERE by_domain.domain != ''
        AND by_domain.status = 'success'
    GROUP BY domain, status
    ORDER BY count DESC
    LIMIT 25;

                    domain                | status  | count
    --------------------------------------+---------+-------
     zenodo.org                           | success | 55549
     arxiv.org                            | success | 24450
     s3-eu-west-1.amazonaws.com           | success | 18156
     res.mdpi.com                         | success | 13493
     www.degruyter.com                    | success | 12009
     journals.openedition.org             | success | 11235
     www.jstage.jst.go.jp                 | success |  9460
     peer.asee.org                        | success |  9416
     www.e-periodica.ch                   | success |  8105
     ir.canterbury.ac.nz                  | success |  6381
     europepmc.org                        | success |  5670
     www.repository.cam.ac.uk             | success |  4858
     assets.researchsquare.com            | success |  4765
     fjfsdata01prod.blob.core.windows.net | success |  4130
     tidsskrift.dk                        | success |  3964
     research-journal.org                 | success |  3127
     ieeexplore.ieee.org                  | success |  2947
     dergipark.org.tr                     | success |  2892
     watermark.silverchair.com            | success |  2315
     journals.plos.org                    | success |  2304
     journal.fi                           | success |  1996
     publications.rwth-aachen.de          | success |  1954
     www.brazilianjournals.com            | success |  1637
     article.sciencepublishinggroup.com   | success |  1589
     revistas.upr.edu                     | success |  1467
    (25 rows)

Casual take-aways:
- wonder what `apps.crossref.org` is
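- sciencedirect crawling broken? (`www.sciencedirect.com`: 14043 `no-pdf-link`, and absent from the top-25 success domains)
- figshare might be broken, or may just have a very low success rate (`figshare.com`, `tandf.figshare.com`, and `springernature.figshare.com` all show thousands of `no-pdf-link`, and none appear in the top-25 success domains)
- seems like a lot of `journals.plos.org` failures (3005 `no-pdf-link` vs. 2304 success)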