Quick notes on how daily ingest is going, circa September/October 2020.

    SELECT ingest_request.ingest_type,
           date(ingest_request.created),
           COUNT(*) as total,
           COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'fatcat-changelog'
    GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
    ORDER BY date(ingest_request.created) DESC;

     ingest_type |    date    | total | success
    -------------+------------+-------+---------
     pdf         | 2020-10-10 |  6145 |    1368
     pdf         | 2020-10-09 | 28453 |    6461
     pdf         | 2020-10-08 | 15105 |    3803
     pdf         | 2020-10-07 | 34213 |   10813
     pdf         | 2020-10-06 | 22263 |    8565
     pdf         | 2020-10-05 |  7910 |    3200
     pdf         | 2020-10-04 | 10865 |    4579
     pdf         | 2020-10-03 | 27745 |   10818
     pdf         | 2020-10-02 | 34320 |   13523
     pdf         | 2020-10-01 | 32548 |   13252
     pdf         | 2020-09-30 | 34798 |   14113
     pdf         | 2020-09-29 | 22463 |    8328
     pdf         | 2020-09-28 |  4117 |    1278
     pdf         | 2020-09-27 |  5894 |    1732
     pdf         | 2020-09-26 | 34949 |   13901
     pdf         | 2020-09-25 | 33680 |   10605
     pdf         | 2020-09-24 | 15125 |    5785
     pdf         | 2020-09-23 | 20866 |    6584
     pdf         | 2020-09-22 | 20949 |    7167
     pdf         | 2020-09-21 | 22483 |    7308
     pdf         | 2020-09-20 | 45644 |   16981
     pdf         | 2020-09-19 | 95571 |   31991
     pdf         | 2020-09-18 | 50849 |   15875
     pdf         | 2020-09-17 | 20121 |    3158
     pdf         | 2020-09-16 | 39184 |   12150
     pdf         | 2020-09-15 | 16986 |    7705
    (26 rows)

    SELECT ingest_file_result.ingest_type,
           ingest_file_result.status,
           COUNT(*)
    FROM ingest_file_result
    LEFT JOIN ingest_request
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_request.ingest_request_source = 'fatcat-changelog'
    GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
    ORDER BY COUNT DESC
    LIMIT 20;

     ingest_type |               status                | count
    -------------+-------------------------------------+--------
     pdf         | success                             | 241047
     pdf         | no-pdf-link                         | 143084
     pdf         | spn2-cdx-lookup-failure             | 108311
     pdf         | gateway-timeout                     |  97250
     pdf         | cdx-error                           |  61820
     pdf         | link-loop                           |  31350
     pdf         | wayback-error                       |   9139
     pdf         | spn2-error:job-failed               |   4240
     pdf         | spn2-error                          |   3893
     pdf         | wrong-mimetype                      |   1010
     pdf         | no-capture                          |    851
     pdf         | null-body                           |    605
     pdf         | redirect-loop                       |    261
     pdf         | spn2-error:soft-time-limit-exceeded |    126
     pdf         | terminal-bad-status                 |    120
     pdf         | petabox-error                       |    105
     pdf         | timeout                             |     29
     pdf         | spn2-error:no-status                |      2
     pdf         | spn2-error:invalid-server-response  |      2
     pdf         | bad-gzip-encoding                   |      1
    (20 rows)

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            -- ingest_request.created >= NOW() - '3 day'::INTERVAL
            ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.ingest_request_source = 'fatcat-changelog'
    ) t1
    WHERE t1.domain != ''
        AND t1.status != 'success'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

                domain            |         status          | count
    ------------------------------+-------------------------+-------
     zenodo.org                   | no-pdf-link             | 52767
     www.degruyter.com            | link-loop               | 17666
     www.degruyter.com            | spn2-cdx-lookup-failure | 17597
     ieeexplore.ieee.org          | gateway-timeout         | 15290
     www.sciencedirect.com        | no-pdf-link             | 14043
     apps.crossref.org            | no-pdf-link             | 11531
     figshare.com                 | no-pdf-link             |  8966
     tandf.figshare.com           | no-pdf-link             |  7276
     zenodo.org                   | no-capture              |  7191
     springernature.figshare.com  | no-pdf-link             |  6485
     www.taylorfrancis.com        | link-loop               |  6266
     www.persee.fr                | terminal-bad-status     |  6031
     journals.openedition.org     | gateway-timeout         |  5639
     www.cairn.info               | link-loop               |  5618
     archaeologydataservice.ac.uk | no-pdf-link             |  5359
     www.taylorfrancis.com        | spn2-cdx-lookup-failure |  4748
     www.e-periodica.ch           | no-pdf-link             |  4722
     osf.io                       | no-capture              |  4247
     cancerres.aacrjournals.org   | no-pdf-link             |  4136
     dlc.library.columbia.edu     | no-pdf-link             |  4085
     www.egms.de                  | no-pdf-link             |  3304
     journals.lww.com             | no-pdf-link             |  3218
     journals.plos.org            | no-pdf-link             |  3005
     linkinghub.elsevier.com      | gateway-timeout         |  2833
     www.egms.de                  | redirect-loop           |  2606
    (25 rows)

    SELECT domain, status, COUNT((domain, status))
    FROM (
        SELECT
            ingest_file_result.ingest_type,
            ingest_file_result.status,
            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
        FROM ingest_file_result
        LEFT JOIN ingest_request
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE
            -- ingest_request.created >= NOW() - '3 day'::INTERVAL
            ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
            AND ingest_request.ingest_type = 'pdf'
            AND ingest_request.ingest_request_source = 'fatcat-changelog'
    ) t1
    WHERE t1.domain != ''
        AND t1.status = 'success'
    GROUP BY domain, status
    ORDER BY COUNT DESC
    LIMIT 25;

                    domain                | status  | count
    --------------------------------------+---------+-------
     zenodo.org                           | success | 55549
     arxiv.org                            | success | 24450
     s3-eu-west-1.amazonaws.com           | success | 18156
     res.mdpi.com                         | success | 13493
     www.degruyter.com                    | success | 12009
     journals.openedition.org             | success | 11235
     www.jstage.jst.go.jp                 | success |  9460
     peer.asee.org                        | success |  9416
     www.e-periodica.ch                   | success |  8105
     ir.canterbury.ac.nz                  | success |  6381
     europepmc.org                        | success |  5670
     www.repository.cam.ac.uk             | success |  4858
     assets.researchsquare.com            | success |  4765
     fjfsdata01prod.blob.core.windows.net | success |  4130
     tidsskrift.dk                        | success |  3964
     research-journal.org                 | success |  3127
     ieeexplore.ieee.org                  | success |  2947
     dergipark.org.tr                     | success |  2892
     watermark.silverchair.com            | success |  2315
     journals.plos.org                    | success |  2304
     journal.fi                           | success |  1996
     publications.rwth-aachen.de          | success |  1954
     www.brazilianjournals.com            | success |  1637
     article.sciencepublishinggroup.com   | success |  1589
     revistas.upr.edu                     | success |  1467
    (25 rows)

Casual take-aways:

- wonder what `apps.crossref.org` is
- sciencedirect crawling broken?
- figshare might be broken? or just very little success
- seems like a lot of journals.plos.org failures