From 1695c1bcd0d084a2004ce24bc01fd4515b49c06d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 17 Oct 2020 17:11:16 -0700 Subject: notes/status on daily ingest --- notes/ingest/2020-10_daily.md | 193 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 notes/ingest/2020-10_daily.md (limited to 'notes') diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md new file mode 100644 index 0000000..d2bb50b --- /dev/null +++ b/notes/ingest/2020-10_daily.md @@ -0,0 +1,193 @@ + +Quick notes on how daily ingest is going, circa September/October 2020. + + + SELECT ingest_request.ingest_type, + date(ingest_request.created), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created) + ORDER BY date(ingest_request.created) DESC; + + ingest_type | date | total | success + -------------+------------+-------+--------- + pdf | 2020-10-10 | 6145 | 1368 + pdf | 2020-10-09 | 28453 | 6461 + pdf | 2020-10-08 | 15105 | 3803 + pdf | 2020-10-07 | 34213 | 10813 + pdf | 2020-10-06 | 22263 | 8565 + pdf | 2020-10-05 | 7910 | 3200 + pdf | 2020-10-04 | 10865 | 4579 + pdf | 2020-10-03 | 27745 | 10818 + pdf | 2020-10-02 | 34320 | 13523 + pdf | 2020-10-01 | 32548 | 13252 + pdf | 2020-09-30 | 34798 | 14113 + pdf | 2020-09-29 | 22463 | 8328 + pdf | 2020-09-28 | 4117 | 1278 + pdf | 2020-09-27 | 5894 | 1732 + pdf | 2020-09-26 | 34949 | 13901 + pdf | 2020-09-25 | 33680 | 10605 + pdf | 2020-09-24 | 15125 | 5785 + pdf | 2020-09-23 | 20866 | 6584 + pdf | 
2020-09-22 | 20949 | 7167 + pdf | 2020-09-21 | 22483 | 7308 + pdf | 2020-09-20 | 45644 | 16981 + pdf | 2020-09-19 | 95571 | 31991 + pdf | 2020-09-18 | 50849 | 15875 + pdf | 2020-09-17 | 20121 | 3158 + pdf | 2020-09-16 | 39184 | 12150 + pdf | 2020-09-15 | 16986 | 7705 + (26 rows) + + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + + ingest_type | status | count + -------------+-------------------------------------+-------- + pdf | success | 241047 + pdf | no-pdf-link | 143084 + pdf | spn2-cdx-lookup-failure | 108311 + pdf | gateway-timeout | 97250 + pdf | cdx-error | 61820 + pdf | link-loop | 31350 + pdf | wayback-error | 9139 + pdf | spn2-error:job-failed | 4240 + pdf | spn2-error | 3893 + pdf | wrong-mimetype | 1010 + pdf | no-capture | 851 + pdf | null-body | 605 + pdf | redirect-loop | 261 + pdf | spn2-error:soft-time-limit-exceeded | 126 + pdf | terminal-bad-status | 120 + pdf | petabox-error | 105 + pdf | timeout | 29 + pdf | spn2-error:no-status | 2 + pdf | spn2-error:invalid-server-response | 2 + pdf | bad-gzip-encoding | 1 + (20 rows) + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + 
ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 52767 + www.degruyter.com | link-loop | 17666 + www.degruyter.com | spn2-cdx-lookup-failure | 17597 + ieeexplore.ieee.org | gateway-timeout | 15290 + www.sciencedirect.com | no-pdf-link | 14043 + apps.crossref.org | no-pdf-link | 11531 + figshare.com | no-pdf-link | 8966 + tandf.figshare.com | no-pdf-link | 7276 + zenodo.org | no-capture | 7191 + springernature.figshare.com | no-pdf-link | 6485 + www.taylorfrancis.com | link-loop | 6266 + www.persee.fr | terminal-bad-status | 6031 + journals.openedition.org | gateway-timeout | 5639 + www.cairn.info | link-loop | 5618 + archaeologydataservice.ac.uk | no-pdf-link | 5359 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748 + www.e-periodica.ch | no-pdf-link | 4722 + osf.io | no-capture | 4247 + cancerres.aacrjournals.org | no-pdf-link | 4136 + dlc.library.columbia.edu | no-pdf-link | 4085 + www.egms.de | no-pdf-link | 3304 + journals.lww.com | no-pdf-link | 3218 + journals.plos.org | no-pdf-link | 3005 + linkinghub.elsevier.com | gateway-timeout | 2833 + www.egms.de | redirect-loop | 2606 + (25 rows) + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND 
ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + --------------------------------------+---------+------- + zenodo.org | success | 55549 + arxiv.org | success | 24450 + s3-eu-west-1.amazonaws.com | success | 18156 + res.mdpi.com | success | 13493 + www.degruyter.com | success | 12009 + journals.openedition.org | success | 11235 + www.jstage.jst.go.jp | success | 9460 + peer.asee.org | success | 9416 + www.e-periodica.ch | success | 8105 + ir.canterbury.ac.nz | success | 6381 + europepmc.org | success | 5670 + www.repository.cam.ac.uk | success | 4858 + assets.researchsquare.com | success | 4765 + fjfsdata01prod.blob.core.windows.net | success | 4130 + tidsskrift.dk | success | 3964 + research-journal.org | success | 3127 + ieeexplore.ieee.org | success | 2947 + dergipark.org.tr | success | 2892 + watermark.silverchair.com | success | 2315 + journals.plos.org | success | 2304 + journal.fi | success | 1996 + publications.rwth-aachen.de | success | 1954 + www.brazilianjournals.com | success | 1637 + article.sciencepublishinggroup.com | success | 1589 + revistas.upr.edu | success | 1467 + (25 rows) + +Casual take-aways: +- unclear what `apps.crossref.org` is (11531 `no-pdf-link` results) +- sciencedirect crawling may be broken? (14043 `no-pdf-link`, and absent from the top successes) +- figshare might be broken? or just has very little success (no figshare domain among the top successes) +- seems like a lot of journals.plos.org failures (3005 `no-pdf-link` vs 2304 successes) -- cgit v1.2.3