aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-10-17 17:11:16 -0700
committerBryan Newbold <bnewbold@archive.org>2020-10-17 17:11:16 -0700
commit1695c1bcd0d084a2004ce24bc01fd4515b49c06d (patch)
tree00db257aab8e77ca7a3b50fcb9e469b327ba939f
parente1698ffa62f0292ad9f8d0e53207dfa3aa23818d (diff)
downloadsandcrawler-1695c1bcd0d084a2004ce24bc01fd4515b49c06d.tar.gz
sandcrawler-1695c1bcd0d084a2004ce24bc01fd4515b49c06d.zip
notes/status on daily ingest
-rw-r--r--notes/ingest/2020-10_daily.md193
1 files changed, 193 insertions, 0 deletions
diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md
new file mode 100644
index 0000000..d2bb50b
--- /dev/null
+++ b/notes/ingest/2020-10_daily.md
@@ -0,0 +1,193 @@
+
+Quick notes on how daily ingest is going, circa September/October 2020.
+
+
+ -- Per-day counts of 'pdf' requests from 'fatcat-changelog' created in the last month that have a result row,
+ -- plus how many of those results have status 'success'. Requests with no result row are not counted.
+ SELECT ingest_request.ingest_type,
+ date(ingest_request.created) AS date,
+ COUNT(*) AS total,
+ COUNT(*) FILTER (WHERE ingest_file_result.status = 'success') AS success
+ FROM ingest_file_result
+ INNER JOIN ingest_request USING (ingest_type, base_url) -- the WHERE filters on ingest_request drop unmatched rows anyway
+ WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_request.ingest_type, date(ingest_request.created) -- join already equates both ingest_type columns
+ ORDER BY date(ingest_request.created) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+-------+---------
+ pdf | 2020-10-10 | 6145 | 1368
+ pdf | 2020-10-09 | 28453 | 6461
+ pdf | 2020-10-08 | 15105 | 3803
+ pdf | 2020-10-07 | 34213 | 10813
+ pdf | 2020-10-06 | 22263 | 8565
+ pdf | 2020-10-05 | 7910 | 3200
+ pdf | 2020-10-04 | 10865 | 4579
+ pdf | 2020-10-03 | 27745 | 10818
+ pdf | 2020-10-02 | 34320 | 13523
+ pdf | 2020-10-01 | 32548 | 13252
+ pdf | 2020-09-30 | 34798 | 14113
+ pdf | 2020-09-29 | 22463 | 8328
+ pdf | 2020-09-28 | 4117 | 1278
+ pdf | 2020-09-27 | 5894 | 1732
+ pdf | 2020-09-26 | 34949 | 13901
+ pdf | 2020-09-25 | 33680 | 10605
+ pdf | 2020-09-24 | 15125 | 5785
+ pdf | 2020-09-23 | 20866 | 6584
+ pdf | 2020-09-22 | 20949 | 7167
+ pdf | 2020-09-21 | 22483 | 7308
+ pdf | 2020-09-20 | 45644 | 16981
+ pdf | 2020-09-19 | 95571 | 31991
+ pdf | 2020-09-18 | 50849 | 15875
+ pdf | 2020-09-17 | 20121 | 3158
+ pdf | 2020-09-16 | 39184 | 12150
+ pdf | 2020-09-15 | 16986 | 7705
+ (26 rows)
+
+
+ -- Top 20 result statuses for 'pdf' requests from 'fatcat-changelog' created in the last 30 days.
+ -- NOTE(review): a result row that matches several requests is counted once per request; confirm that is intended.
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) AS count
+ FROM ingest_file_result
+ INNER JOIN ingest_request USING (ingest_type, base_url) -- the WHERE filters on ingest_request drop unmatched rows anyway
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY count DESC -- sorts on the explicit alias instead of the implicit aggregate column name
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+--------
+ pdf | success | 241047
+ pdf | no-pdf-link | 143084
+ pdf | spn2-cdx-lookup-failure | 108311
+ pdf | gateway-timeout | 97250
+ pdf | cdx-error | 61820
+ pdf | link-loop | 31350
+ pdf | wayback-error | 9139
+ pdf | spn2-error:job-failed | 4240
+ pdf | spn2-error | 3893
+ pdf | wrong-mimetype | 1010
+ pdf | no-capture | 851
+ pdf | null-body | 605
+ pdf | redirect-loop | 261
+ pdf | spn2-error:soft-time-limit-exceeded | 126
+ pdf | terminal-bad-status | 120
+ pdf | petabox-error | 105
+ pdf | timeout | 29
+ pdf | spn2-error:no-status | 2
+ pdf | spn2-error:invalid-server-response | 2
+ pdf | bad-gzip-encoding | 1
+ (20 rows)
+
+ -- Top 25 (domain, status) pairs for non-'success' results updated in the last 30 days, limited to
+ -- 'pdf' requests from 'fatcat-changelog'. domain is the part of terminal_url between '://' and the next '/'.
+ SELECT domain, status, COUNT(*) AS count
+ FROM (
+ SELECT
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ -- INNER JOIN: the ingest_request filters below already discard result rows with no matching request.
+ INNER JOIN ingest_request USING (ingest_type, base_url)
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) results
+ WHERE results.domain != '' -- also excludes NULL domains (terminal_url missing or not matching the pattern)
+ AND results.status != 'success'
+ GROUP BY domain, status
+ ORDER BY count DESC -- sorts on the explicit alias instead of the implicit aggregate column name
+ LIMIT 25;
+
+
+ domain | status | count
+ ------------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 52767
+ www.degruyter.com | link-loop | 17666
+ www.degruyter.com | spn2-cdx-lookup-failure | 17597
+ ieeexplore.ieee.org | gateway-timeout | 15290
+ www.sciencedirect.com | no-pdf-link | 14043
+ apps.crossref.org | no-pdf-link | 11531
+ figshare.com | no-pdf-link | 8966
+ tandf.figshare.com | no-pdf-link | 7276
+ zenodo.org | no-capture | 7191
+ springernature.figshare.com | no-pdf-link | 6485
+ www.taylorfrancis.com | link-loop | 6266
+ www.persee.fr | terminal-bad-status | 6031
+ journals.openedition.org | gateway-timeout | 5639
+ www.cairn.info | link-loop | 5618
+ archaeologydataservice.ac.uk | no-pdf-link | 5359
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748
+ www.e-periodica.ch | no-pdf-link | 4722
+ osf.io | no-capture | 4247
+ cancerres.aacrjournals.org | no-pdf-link | 4136
+ dlc.library.columbia.edu | no-pdf-link | 4085
+ www.egms.de | no-pdf-link | 3304
+ journals.lww.com | no-pdf-link | 3218
+ journals.plos.org | no-pdf-link | 3005
+ linkinghub.elsevier.com | gateway-timeout | 2833
+ www.egms.de | redirect-loop | 2606
+ (25 rows)
+
+ -- Top 25 domains by number of 'success' results updated in the last 30 days, limited to
+ -- 'pdf' requests from 'fatcat-changelog'. domain is the part of terminal_url between '://' and the next '/'.
+ SELECT domain, status, COUNT(*) AS count
+ FROM (
+ SELECT
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ -- INNER JOIN: the ingest_request filters below already discard result rows with no matching request.
+ INNER JOIN ingest_request USING (ingest_type, base_url)
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) results
+ WHERE results.domain != '' -- also excludes NULL domains (terminal_url missing or not matching the pattern)
+ AND results.status = 'success'
+ GROUP BY domain, status
+ ORDER BY count DESC -- sorts on the explicit alias instead of the implicit aggregate column name
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------------------+---------+-------
+ zenodo.org | success | 55549
+ arxiv.org | success | 24450
+ s3-eu-west-1.amazonaws.com | success | 18156
+ res.mdpi.com | success | 13493
+ www.degruyter.com | success | 12009
+ journals.openedition.org | success | 11235
+ www.jstage.jst.go.jp | success | 9460
+ peer.asee.org | success | 9416
+ www.e-periodica.ch | success | 8105
+ ir.canterbury.ac.nz | success | 6381
+ europepmc.org | success | 5670
+ www.repository.cam.ac.uk | success | 4858
+ assets.researchsquare.com | success | 4765
+ fjfsdata01prod.blob.core.windows.net | success | 4130
+ tidsskrift.dk | success | 3964
+ research-journal.org | success | 3127
+ ieeexplore.ieee.org | success | 2947
+ dergipark.org.tr | success | 2892
+ watermark.silverchair.com | success | 2315
+ journals.plos.org | success | 2304
+ journal.fi | success | 1996
+ publications.rwth-aachen.de | success | 1954
+ www.brazilianjournals.com | success | 1637
+ article.sciencepublishinggroup.com | success | 1589
+ revistas.upr.edu | success | 1467
+ (25 rows)
+
+Casual take-aways:
+- wonder what `apps.crossref.org` is
+- sciencedirect crawling broken?
+- figshare might be broken? or just very little success (three figshare domains in the top no-pdf-link failures, none in the top-25 success list)
+- seems like a lot of journals.plos.org failures (3005 no-pdf-link vs. 2304 success)