aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-09-08 12:16:44 -0700
committerBryan Newbold <bnewbold@archive.org>2021-09-08 12:16:44 -0700
commitce25d59845083ca0beab98144b0c43bfc4254d6d (patch)
tree4f52976982fabb3fed7a73de6602f5286c0c480e /notes
parent1a743b4fccb688d5b395e05a8ad71405f24e9c51 (diff)
downloadsandcrawler-ce25d59845083ca0beab98144b0c43bfc4254d6d.tar.gz
sandcrawler-ce25d59845083ca0beab98144b0c43bfc4254d6d.zip
daily OA crawl improvements/notes
Diffstat (limited to 'notes')
-rw-r--r--notes/ingest/2021-09-03_daily_improvements.md1021
1 files changed, 1021 insertions, 0 deletions
diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md
new file mode 100644
index 0000000..a0bb0c5
--- /dev/null
+++ b/notes/ingest/2021-09-03_daily_improvements.md
@@ -0,0 +1,1021 @@
+
+Periodic check-in of daily crawling/ingest.
+
+Overall ingest status, past 30 days:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ ingest_type | status | count
+ -------------+-------------------------+--------
+ pdf | no-pdf-link | 158474
+ pdf | spn2-cdx-lookup-failure | 135344
+ pdf | success | 127938
+ pdf | spn2-error | 65411
+ pdf | gateway-timeout | 63112
+ pdf | blocked-cookie | 26338
+ pdf | terminal-bad-status | 24853
+ pdf | link-loop | 15699
+ pdf | spn2-error:job-failed | 13862
+ pdf | redirect-loop | 11432
+ pdf | cdx-error | 2376
+ pdf | too-many-redirects | 2186
+ pdf | wrong-mimetype | 2142
+ pdf | forbidden | 1758
+ pdf | spn2-error:no-status | 972
+ pdf | not-found | 820
+ pdf | bad-redirect | 536
+ pdf | read-timeout | 392
+ pdf | wayback-error | 251
+ pdf | remote-server-error | 220
+ (20 rows)
+
+Hrm, that is a healthy fraction of `no-pdf-link`.
+
+Broken domains, past 30 days:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ -- ingest_request.created >= NOW() - '3 day'::INTERVAL
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -------------------------+-------------------------+-------
+ zenodo.org | no-pdf-link | 39678
+ osf.io | gateway-timeout | 29809
+ acervus.unicamp.br | no-pdf-link | 21978
+ osf.io | terminal-bad-status | 18727
+ zenodo.org | spn2-cdx-lookup-failure | 17008
+ doi.org | spn2-cdx-lookup-failure | 15503
+ www.degruyter.com | no-pdf-link | 15122
+ ieeexplore.ieee.org | spn2-error:job-failed | 12921
+ osf.io | spn2-cdx-lookup-failure | 11123
+ www.tandfonline.com | blocked-cookie | 8096
+ www.morressier.com | no-pdf-link | 4655
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580
+ pubs.acs.org | blocked-cookie | 4415
+ www.frontiersin.org | no-pdf-link | 4163
+ www.degruyter.com | spn2-cdx-lookup-failure | 3788
+ www.taylorfrancis.com | no-pdf-link | 3568
+ www.sciencedirect.com | no-pdf-link | 3128
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116
+ acervus.unicamp.br | spn2-cdx-lookup-failure | 2797
+ www.mdpi.com | spn2-cdx-lookup-failure | 2719
+ brill.com | link-loop | 2681
+ linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 2546
+ apps.crossref.org | no-pdf-link | 2537
+ onlinelibrary.wiley.com | blocked-cookie | 2528
+ (25 rows)
+
+Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure:
+
+ SELECT domain, status, count
+ FROM (
+ SELECT domain, status, COUNT((domain, status)) as count
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status != 'spn2-cdx-lookup-failure'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY CUBE (domain, status)
+ ) t2
+ WHERE count > 200
+ ORDER BY domain ASC , count DESC;
+
+
+ domain | status | count
+ -----------------------------------------------------------------+-----------------------+--------
+ academic.oup.com | | 2405
+ academic.oup.com | no-pdf-link | 1240
+ academic.oup.com | link-loop | 1010
+ acervus.unicamp.br | | 21980
+ acervus.unicamp.br | no-pdf-link | 21978 **
+ aclanthology.org | | 208
+ acp.copernicus.org | | 365
+ acp.copernicus.org | success | 356
+ aip.scitation.org | | 1071
+ aip.scitation.org | blocked-cookie | 843
+ aip.scitation.org | redirect-loop | 227
+ apps.crossref.org | | 2537
+ apps.crossref.org | no-pdf-link | 2537
+ arxiv.org | | 17817
+ arxiv.org | success | 17370
+ arxiv.org | terminal-bad-status | 320
+ asmedigitalcollection.asme.org | | 401
+ asmedigitalcollection.asme.org | link-loop | 364
+ assets.researchsquare.com | | 3706
+ assets.researchsquare.com | success | 3706
+ avmj.journals.ekb.eg | | 605
+ avmj.journals.ekb.eg | success | 595
+ bfa.journals.ekb.eg | | 224
+ bfa.journals.ekb.eg | success | 214
+ biorxiv.org | redirect-loop | 895
+ biorxiv.org | | 895
+ birdsoftheworld.org | | 286
+ birdsoftheworld.org | no-pdf-link | 285
+ bmjopen.bmj.com | success | 232
+ bmjopen.bmj.com | | 232
+ books.openedition.org | | 396
+ books.openedition.org | no-pdf-link | 396
+ brill.com | | 4272
+ brill.com | link-loop | 2681
+ brill.com | no-pdf-link | 1410
+ cas.columbia.edu | | 1038
+ cas.columbia.edu | no-pdf-link | 1038 **
+ cdr.lib.unc.edu | | 513
+ cdr.lib.unc.edu | success | 469
+ chemrxiv.org | | 278
+ chemrxiv.org | success | 275
+ classiques-garnier.com | | 531
+ classiques-garnier.com | no-pdf-link | 487 *
+ content.iospress.com | | 275
+ content.iospress.com | link-loop | 230
+ cris.maastrichtuniversity.nl | | 318
+ cris.maastrichtuniversity.nl | success | 284
+ cyberleninka.ru | | 1165
+ cyberleninka.ru | success | 1134
+ deepblue.lib.umich.edu | | 289
+ dergipark.org.tr | | 1185
+ dergipark.org.tr | success | 774
+ dergipark.org.tr | no-pdf-link | 320
+ didaktorika.gr | | 688
+ didaktorika.gr | redirect-loop | 688
+ digi.ub.uni-heidelberg.de | | 292
+ digi.ub.uni-heidelberg.de | no-pdf-link | 292
+ direct.mit.edu | | 236
+ direct.mit.edu | no-pdf-link | 207 *
+ dl.acm.org | | 2319
+ dl.acm.org | blocked-cookie | 2230
+ dmtcs.episciences.org | | 733
+ dmtcs.episciences.org | success | 730
+ doi.ala.org.au | no-pdf-link | 2373 **
+ doi.ala.org.au | | 2373
+ doi.org | | 732
+ doi.org | terminal-bad-status | 673
+ downloads.hindawi.com | success | 1452
+ downloads.hindawi.com | | 1452
+ drive.google.com | | 216
+ drive.google.com | no-pdf-link | 211
+ dtb.bmj.com | | 674
+ dtb.bmj.com | link-loop | 669
+ easy.dans.knaw.nl | no-pdf-link | 261 *
+ easy.dans.knaw.nl | | 261
+ ebooks.marilia.unesp.br | | 688
+ ebooks.marilia.unesp.br | no-pdf-link | 688 *
+ ehp.niehs.nih.gov | | 766
+ ehp.niehs.nih.gov | blocked-cookie | 765
+ ejournal.mandalanursa.org | | 307
+ ejournal.mandalanursa.org | success | 305
+ elib.spbstu.ru | | 264
+ elib.spbstu.ru | redirect-loop | 257
+ elibrary.ru | | 1367
+ elibrary.ru | redirect-loop | 1169
+ elibrary.vdi-verlag.de | | 1251
+ elibrary.vdi-verlag.de | no-pdf-link | 646
+ elibrary.vdi-verlag.de | link-loop | 537
+ elifesciences.org | | 328
+ elifesciences.org | success | 323
+ figshare.com | | 803
+ figshare.com | no-pdf-link | 714 *
+ files.osf.io | | 745
+ files.osf.io | success | 614
+ hammer.purdue.edu | | 244
+ hammer.purdue.edu | no-pdf-link | 243
+ heiup.uni-heidelberg.de | | 277
+ heiup.uni-heidelberg.de | no-pdf-link | 268
+ hkvalidate.perfdrive.com | no-pdf-link | 370 *
+ hkvalidate.perfdrive.com | | 370
+ ieeexplore.ieee.org | | 16675
+ ieeexplore.ieee.org | spn2-error:job-failed | 12927
+ ieeexplore.ieee.org | success | 1952
+ ieeexplore.ieee.org | too-many-redirects | 1193
+ ieeexplore.ieee.org | no-pdf-link | 419
+ jamanetwork.com | | 339
+ jamanetwork.com | success | 216
+ jmstt.ntou.edu.tw | | 244
+ jmstt.ntou.edu.tw | success | 241
+ journal.ipb.ac.id | | 229
+ journal.ipb.ac.id | success | 206
+ journal.nafe.org | | 221
+ journals.aps.org | | 614
+ journals.aps.org | gateway-timeout | 495
+ journals.asm.org | | 463
+ journals.asm.org | blocked-cookie | 435
+ journals.flvc.org | | 230
+ journals.lww.com | | 1300
+ journals.lww.com | link-loop | 1284
+ journals.openedition.org | | 543
+ journals.openedition.org | success | 311
+ journals.ub.uni-heidelberg.de | | 357
+ journals.ub.uni-heidelberg.de | success | 311
+ jov.arvojournals.org | | 431
+ jov.arvojournals.org | no-pdf-link | 422 *
+ kiss.kstudy.com | | 303
+ kiss.kstudy.com | no-pdf-link | 303 *
+ library.iated.org | | 364
+ library.iated.org | redirect-loop | 264
+ library.seg.org | blocked-cookie | 301
+ library.seg.org | | 301
+ link.aps.org | redirect-loop | 442
+ link.aps.org | | 442
+ linkinghub.elsevier.com | | 515
+ linkinghub.elsevier.com | gateway-timeout | 392
+ mc.sbm.org.br | | 224
+ mc.sbm.org.br | success | 224
+ mdpi-res.com | | 742
+ mdpi-res.com | success | 742
+ mdsoar.org | | 220
+ mediarep.org | | 269
+ mediarep.org | success | 264
+ medrxiv.org | redirect-loop | 290
+ medrxiv.org | | 290
+ muse.jhu.edu | | 429
+ muse.jhu.edu | terminal-bad-status | 391
+ mvmj.journals.ekb.eg | | 306
+ oapub.org | | 292
+ oapub.org | success | 289
+ onepetro.org | | 426
+ onepetro.org | link-loop | 406
+ onlinelibrary.wiley.com | | 2835
+ onlinelibrary.wiley.com | blocked-cookie | 2531
+ onlinelibrary.wiley.com | redirect-loop | 264
+ open.library.ubc.ca | | 569
+ open.library.ubc.ca | no-pdf-link | 425 *
+ opendata.uni-halle.de | | 407
+ opendata.uni-halle.de | success | 263
+ osf.io | | 49022
+ osf.io | gateway-timeout | 29810
+ osf.io | terminal-bad-status | 18731
+ osf.io | spn2-error | 247
+ osf.io | not-found | 205
+ oxford.universitypressscholarship.com | | 392
+ oxford.universitypressscholarship.com | link-loop | 233
+ panor.ru | no-pdf-link | 433 *
+ panor.ru | | 433
+ papers.ssrn.com | | 1630
+ papers.ssrn.com | link-loop | 1598
+ pdf.sciencedirectassets.com | | 3063
+ pdf.sciencedirectassets.com | success | 3063
+ peerj.com | | 464
+ peerj.com | no-pdf-link | 303 *
+ periodicos.ufpe.br | | 245
+ periodicos.ufpe.br | success | 232
+ periodicos.unb.br | | 230
+ periodicos.unb.br | success | 221
+ preprints.jmir.org | | 548
+ preprints.jmir.org | cdx-error | 499
+ publications.rwth-aachen.de | | 213
+ publikationen.bibliothek.kit.edu | | 346
+ publikationen.bibliothek.kit.edu | success | 314
+ publikationen.uni-tuebingen.de | | 623
+ publikationen.uni-tuebingen.de | no-pdf-link | 522 *
+ publons.com | no-pdf-link | 934 *
+ publons.com | | 934
+ pubs.acs.org | | 4507
+ pubs.acs.org | blocked-cookie | 4406
+ pubs.rsc.org | | 1638
+ pubs.rsc.org | link-loop | 1054
+ pubs.rsc.org | redirect-loop | 343
+ pubs.rsc.org | success | 201
+ repositorio.ufu.br | | 637
+ repositorio.ufu.br | success | 607
+ repository.dri.ie | | 1852
+ repository.dri.ie | no-pdf-link | 1852 **
+ repository.library.brown.edu | | 293
+ repository.library.brown.edu | no-pdf-link | 291 *
+ res.mdpi.com | | 10367
+ res.mdpi.com | success | 10360
+ retrovirology.biomedcentral.com | | 230
+ revistas.ufrj.br | | 284
+ revistas.ufrj.br | success | 283
+ revistas.uptc.edu.co | | 385
+ revistas.uptc.edu.co | success | 344
+ royalsocietypublishing.org | | 231
+ rsdjournal.org | | 347
+ rsdjournal.org | success | 343
+ s3-ap-southeast-2.amazonaws.com | | 400
+ s3-ap-southeast-2.amazonaws.com | success | 392
+ s3-eu-west-1.amazonaws.com | | 2096
+ s3-eu-west-1.amazonaws.com | success | 2091
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286
+ s3.ca-central-1.amazonaws.com | | 202
+ sage.figshare.com | | 242
+ sage.figshare.com | no-pdf-link | 241
+ sajeb.org | | 246
+ sajeb.org | no-pdf-link | 243
+ scholar.dkyobobook.co.kr | | 332
+ scholar.dkyobobook.co.kr | no-pdf-link | 328 *
+ search.mandumah.com | | 735
+ search.mandumah.com | redirect-loop | 726
+ secure.jbs.elsevierhealth.com | | 1112
+ secure.jbs.elsevierhealth.com | blocked-cookie | 1108
+ stm.bookpi.org | no-pdf-link | 468 *
+ stm.bookpi.org | | 468
+ storage.googleapis.com | | 1012
+ storage.googleapis.com | success | 1012
+ tandf.figshare.com | | 469
+ tandf.figshare.com | no-pdf-link | 466
+ teses.usp.br | | 739
+ teses.usp.br | success | 730
+ tidsskrift.dk | | 360
+ tidsskrift.dk | success | 346
+ tiedejaedistys.journal.fi | | 224
+ tind-customer-agecon.s3.amazonaws.com | success | 332
+ tind-customer-agecon.s3.amazonaws.com | | 332
+ valep.vc.univie.ac.at | no-pdf-link | 280
+ valep.vc.univie.ac.at | | 280
+ watermark.silverchair.com | | 1729
+ watermark.silverchair.com | success | 1719
+ www.academia.edu | | 387
+ www.academia.edu | no-pdf-link | 386
+ www.ahajournals.org | | 430
+ www.ahajournals.org | blocked-cookie | 413
+ www.atenaeditora.com.br | | 572
+ www.atenaeditora.com.br | terminal-bad-status | 513
+ www.atlantis-press.com | success | 722
+ www.atlantis-press.com | | 722
+ www.aup-online.com | | 419
+ www.aup-online.com | no-pdf-link | 419 *
+ www.beck-elibrary.de | | 269
+ www.beck-elibrary.de | no-pdf-link | 268 *
+ www.biodiversitylibrary.org | no-pdf-link | 528 *
+ www.biodiversitylibrary.org | | 528
+ www.bloomsburycollections.com | | 623
+ www.bloomsburycollections.com | no-pdf-link | 605 *
+ www.cabi.org | | 2191
+ www.cabi.org | no-pdf-link | 2186 *
+ www.cairn.info | | 1283
+ www.cairn.info | no-pdf-link | 713
+ www.cairn.info | link-loop | 345
+ www.cambridge.org | | 4128
+ www.cambridge.org | no-pdf-link | 1531
+ www.cambridge.org | success | 1441
+ www.cambridge.org | link-loop | 971
+ www.cureus.com | no-pdf-link | 526 *
+ www.cureus.com | | 526
+ www.dbpia.co.kr | | 637
+ www.dbpia.co.kr | redirect-loop | 631
+ www.deboni.he.com.br | | 382
+ www.deboni.he.com.br | success | 381
+ www.degruyter.com | | 17783
+ www.degruyter.com | no-pdf-link | 15102
+ www.degruyter.com | success | 2584
+ www.dovepress.com | | 480
+ www.dovepress.com | success | 472
+ www.e-manuscripta.ch | | 1350
+ www.e-manuscripta.ch | no-pdf-link | 1350 *
+ www.e-periodica.ch | | 1276
+ www.e-periodica.ch | no-pdf-link | 1275
+ www.e-rara.ch | | 202
+ www.e-rara.ch | no-pdf-link | 202
+ www.elgaronline.com | | 495
+ www.elgaronline.com | link-loop | 290
+ www.elibrary.ru | | 922
+ www.elibrary.ru | no-pdf-link | 904
+ www.emerald.com | | 2155
+ www.emerald.com | no-pdf-link | 1936 *
+ www.emerald.com | success | 219
+ www.eurekaselect.com | | 518
+ www.eurekaselect.com | no-pdf-link | 516 *
+ www.frontiersin.org | | 4163
+ www.frontiersin.org | no-pdf-link | 4162 **
+ www.hanser-elibrary.com | | 444
+ www.hanser-elibrary.com | blocked-cookie | 444
+ www.hanspub.org | | 334
+ www.hanspub.org | no-pdf-link | 314
+ www.idunn.no | | 1736
+ www.idunn.no | link-loop | 596
+ www.idunn.no | success | 577
+ www.idunn.no | no-pdf-link | 539
+ www.igi-global.com | terminal-bad-status | 458
+ www.igi-global.com | | 458
+ www.ijcai.org | | 533
+ www.ijcai.org | success | 532
+ www.ijraset.com | success | 385
+ www.ijraset.com | | 385
+ www.inderscience.com | | 712
+ www.inderscience.com | no-pdf-link | 605 *
+ www.ingentaconnect.com | | 456
+ www.ingentaconnect.com | no-pdf-link | 413 *
+ www.internationaljournalssrg.org | | 305
+ www.internationaljournalssrg.org | no-pdf-link | 305 *
+ www.isca-speech.org | | 2392
+ www.isca-speech.org | no-pdf-link | 2391 **
+ www.journals.uchicago.edu | | 228
+ www.journals.uchicago.edu | blocked-cookie | 227
+ www.jstage.jst.go.jp | | 1492
+ www.jstage.jst.go.jp | success | 1185
+ www.jstage.jst.go.jp | no-pdf-link | 289
+ www.jstor.org | | 301
+ www.jurology.com | | 887
+ www.jurology.com | redirect-loop | 887
+ www.karger.com | | 318
+ www.liebertpub.com | | 507
+ www.liebertpub.com | blocked-cookie | 496
+ www.morressier.com | | 4781
+ www.morressier.com | no-pdf-link | 4655 **
+ www.ncl.ecu.edu | | 413
+ www.ncl.ecu.edu | success | 413
+ www.nomos-elibrary.de | | 526
+ www.nomos-elibrary.de | no-pdf-link | 391
+ www.oecd-ilibrary.org | no-pdf-link | 1170 **
+ www.oecd-ilibrary.org | | 1170
+ www.openagrar.de | no-pdf-link | 221
+ www.openagrar.de | | 221
+ www.osapublishing.org | | 900
+ www.osapublishing.org | link-loop | 615
+ www.osapublishing.org | no-pdf-link | 269
+ www.osti.gov | | 630
+ www.osti.gov | link-loop | 573
+ www.oxfordlawtrove.com | no-pdf-link | 476 *
+ www.oxfordlawtrove.com | | 476
+ www.pdcnet.org | | 298
+ www.pdcnet.org | terminal-bad-status | 262
+ www.pedocs.de | | 203
+ www.pnas.org | | 222
+ www.preprints.org | | 372
+ www.preprints.org | success | 366
+ www.repository.cam.ac.uk | | 801
+ www.repository.cam.ac.uk | success | 359
+ www.repository.cam.ac.uk | no-pdf-link | 239
+ www.research-collection.ethz.ch | | 276
+ www.research-collection.ethz.ch | terminal-bad-status | 274
+ www.revistas.usp.br | | 207
+ www.revistas.usp.br | success | 204
+ www.rina.org.uk | no-pdf-link | 1009 **
+ www.rina.org.uk | | 1009
+ www.schweizerbart.de | no-pdf-link | 202
+ www.schweizerbart.de | | 202
+ www.scielo.br | | 544
+ www.scielo.br | redirect-loop | 526
+ www.sciencedirect.com | | 3901
+ www.sciencedirect.com | no-pdf-link | 3127 **
+ www.sciencedirect.com | link-loop | 701
+ www.sciendo.com | | 384
+ www.sciendo.com | success | 363
+ www.sciengine.com | | 225
+ www.scirp.org | | 209
+ www.spandidos-publications.com | | 205
+ www.tandfonline.com | | 8925
+ www.tandfonline.com | blocked-cookie | 8099
+ www.tandfonline.com | terminal-bad-status | 477
+ www.tandfonline.com | redirect-loop | 322
+ www.taylorfrancis.com | | 6119
+ www.taylorfrancis.com | no-pdf-link | 3567
+ www.taylorfrancis.com | link-loop | 2169
+ www.taylorfrancis.com | terminal-bad-status | 353
+ www.thieme-connect.de | | 1047
+ www.thieme-connect.de | redirect-loop | 472
+ www.thieme-connect.de | spn2-error:job-failed | 343
+ www.tib.eu | | 206
+ www.trp.org.in | | 311
+ www.trp.org.in | success | 311
+ www.un-ilibrary.org | no-pdf-link | 597 *
+ www.un-ilibrary.org | | 597
+ www.vr-elibrary.de | | 775
+ www.vr-elibrary.de | blocked-cookie | 774
+ www.wjgnet.com | | 204
+ www.wjgnet.com | no-pdf-link | 204
+ www.worldscientific.com | | 974
+ www.worldscientific.com | blocked-cookie | 971
+ www.worldwidejournals.com | | 242
+ www.worldwidejournals.com | no-pdf-link | 203
+ www.wto-ilibrary.org | no-pdf-link | 295
+ www.wto-ilibrary.org | | 295
+ www.zora.uzh.ch | | 222
+ zenodo.org | | 49460
+ zenodo.org | no-pdf-link | 39721
+ zenodo.org | success | 8954
+ zenodo.org | wrong-mimetype | 562
+ | | 445919
+ | no-pdf-link | 168035
+ | success | 140875
+ | gateway-timeout | 31809
+ | blocked-cookie | 26431
+ | terminal-bad-status | 25625
+ | link-loop | 19006
+ | spn2-error:job-failed | 13962
+ | redirect-loop | 12512
+ | wrong-mimetype | 2302
+ | spn2-error | 1689
+ | too-many-redirects | 1203
+ | bad-redirect | 732
+ | cdx-error | 539
+ | not-found | 420
+ | spn2-error:no-status | 256
+ (419 rows)
+
+Get random subsets by terminal domain:
+
+ \x auto
+ SELECT
+ ingest_request.link_source_id AS link_source_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-changelog'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%'
+ ORDER BY random()
+ LIMIT 5;
+
+## acervus.unicamp.br
+
+Previously flagged as messy (2021-05_daily_improvements.md)
+
+## cas.columbia.edu
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-2ety-qm51
+base_url | https://doi.org/10.7916/d8-2ety-qm51
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-0zf6-d167
+base_url | https://doi.org/10.7916/d8-0zf6-d167
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-k6ha-sn43
+base_url | https://doi.org/10.7916/d8-k6ha-sn43
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-bj6t-eb07
+base_url | https://doi.org/10.7916/d8-bj6t-eb07
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7916/d8-xjac-j502
+base_url | https://doi.org/10.7916/d8-xjac-j502
+terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback
+
+these are not public (loginwalls)
+
+DONE: '/login?TARGET=' as a login wall pattern
+
+## doi.ala.org.au
+
+Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md)
+
+NOTE: look at ingesting datasets
+
+## www.isca-speech.org
+
+-[ RECORD 1 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2014-84
+base_url | https://doi.org/10.21437/interspeech.2014-84
+terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html
+-[ RECORD 2 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2004-319
+base_url | https://doi.org/10.21437/interspeech.2004-319
+terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html
+-[ RECORD 3 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-372
+base_url | https://doi.org/10.21437/interspeech.2006-372
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html
+-[ RECORD 4 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2015-588
+base_url | https://doi.org/10.21437/interspeech.2015-588
+terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html
+-[ RECORD 5 ]--+----------------------------------------------------------------------------------
+link_source_id | 10.21437/interspeech.2006-468
+base_url | https://doi.org/10.21437/interspeech.2006-468
+terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html
+
+Bespoke site. Added rule to sandcrawler.
+
+NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?)
+
+## www.morressier.com
+
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0002858v
+base_url | https://doi.org/10.1115/1.0002858v
+terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0003896v
+base_url | https://doi.org/10.1115/1.0003896v
+terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0004476v
+base_url | https://doi.org/10.1115/1.0004476v
+terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0001286v
+base_url | https://doi.org/10.1115/1.0001286v
+terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1115/1.0000315v
+base_url | https://doi.org/10.1115/1.0000315v
+terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874
+
+Many of these seem to be presentations, as both video and slides. PDFs seem broken though.
+
+NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data)
+
+## www.oecd-ilibrary.org
+
+Paywall (2021-05_daily_improvements.md)
+
+## www.rina.org.uk
+
+-[ RECORD 1 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.ws.2002.10
+base_url | https://doi.org/10.3940/rina.ws.2002.10
+terminal_url | https://www.rina.org.uk/showproducts.html?product=4116
+-[ RECORD 2 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.pass.2003.16
+base_url | https://doi.org/10.3940/rina.pass.2003.16
+terminal_url | https://www.rina.org.uk/showproducts.html?product=3566
+-[ RECORD 3 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin.2013.15
+base_url | https://doi.org/10.3940/rina.icsotin.2013.15
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8017
+-[ RECORD 4 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.wfa.2010.23
+base_url | https://doi.org/10.3940/rina.wfa.2010.23
+terminal_url | https://www.rina.org.uk/showproducts.html?product=8177
+-[ RECORD 5 ]--+-------------------------------------------------------
+link_source_id | 10.3940/rina.icsotin15.2015.01
+base_url | https://doi.org/10.3940/rina.icsotin15.2015.01
+terminal_url | https://www.rina.org.uk/showproducts.html?product=7883
+
+Site is broken in some way
+
+## www.sciencedirect.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.jhlste.2021.100332
+base_url | https://doi.org/10.1016/j.jhlste.2021.100332
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.hazadv.2021.100006
+base_url | https://doi.org/10.1016/j.hazadv.2021.100006
+terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-12-822844-9.00009-8
+base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/j.colcom.2021.100490
+base_url | https://doi.org/10.1016/j.colcom.2021.100490
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1016/b978-0-323-85245-6.00012-6
+base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6
+terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126
+
+These no-pdf-url ones seem to just be not OA, which is expected for much of the
+domain.
+
+## repository.dri.ie
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+---------------------------------------------
+ 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941
+ 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f
+ 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102
+ 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t
+ 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726
+
+"Digital repository of Ireland"
+
+Historical scanned content. Bespoke site. Fixed.
+
+NOTE: recrawl/retry this domain
+
+## www.frontiersin.org
+
+-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/978-2-88971-147-5
+base_url | https://doi.org/10.3389/978-2-88971-147-5
+terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis
+-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fnins.2021.722592
+base_url | https://doi.org/10.3389/fnins.2021.722592
+terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full
+-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fcell.2021.683209
+base_url | https://doi.org/10.3389/fcell.2021.683209
+terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full
+-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fmicb.2021.692474
+base_url | https://doi.org/10.3389/fmicb.2021.692474
+terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full
+-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3389/fneur.2021.676527
+base_url | https://doi.org/10.3389/fneur.2021.676527
+terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full
+
+All the `/research-topics/` URLs are out of scope.
+
+NOTE: recrawl missing frontiersin.org articles for PDFs
+NOTE: recrawl missing frontiersin.org articles for XML (?)
+
+-------
+
+## direct.mit.edu
+
+Previously "not available" (2021-05_daily_improvements.md)
+
+## figshare.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15052236.v6
+base_url | https://doi.org/10.6084/m9.figshare.15052236.v6
+terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.14907846.v5
+base_url | https://doi.org/10.6084/m9.figshare.14907846.v5
+terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15157614.v1
+base_url | https://doi.org/10.6084/m9.figshare.15157614.v1
+terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.15172926.v1
+base_url | https://doi.org/10.6084/m9.figshare.15172926.v1
+terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.6084/m9.figshare.16532574.v1
+base_url | https://doi.org/10.6084/m9.figshare.16532574.v1
+terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1
+
+NOTE: can determine from the redirect URL, I guess. This is helpful for ingest!
+Could also potentially correct fatcat release_type using this info.
+
+We seem to be getting the ones we can (eg, papers) just fine
+
+## hkvalidate.perfdrive.com
+
+Should be skipping/bailing on this domain, but not for some reason.
+
+-[ RECORD 1 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05cc
+base_url | https://doi.org/10.3847/1538-4357/ac05cc
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 2 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac0429
+base_url | https://doi.org/10.3847/1538-4357/ac0429
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 3 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.1149/1945-7111/ac1a85
+base_url | https://doi.org/10.1149/1945-7111/ac1a85
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 4 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.35848/1882-0786/ac1b0d
+base_url | https://doi.org/10.35848/1882-0786/ac1b0d
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+-[ RECORD 5 ]--+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.3847/1538-4357/ac05ba
+base_url | https://doi.org/10.3847/1538-4357/ac05ba
+terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw=
+
+Was failing to check against blocklist again at the end of attempts.
+
+Could retry all these to update status, but probably not worth it.
+
+## jov.arvojournals.org
+
+ link_source_id | base_url | terminal_url
+-----------------------+---------------------------------------+-------------------------------------------------------------
+ 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021
+ 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561
+ 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057
+ 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793
+ 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441
+
+These seem to just not be published/available yet.
+
+But they also use watermark.silverchair.com
+
+NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest
+NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix
+
+## kiss.kstudy.com
+
+Previously unable to download (2021-05_daily_improvements.md)
+
+## open.library.ubc.ca
+
+ link_source_id | base_url | terminal_url
+--------------------+------------------------------------+----------------------------------------------------------------------------------
+ 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664
+ 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189
+ 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+ 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994
+ 10.14288/1.0401312 | https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312
+
+Historical newspapers, out of scope?
+
+Video content:
+https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487
+
+Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+
+NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+
+
+## panor.ru
+
+ link_source_id | base_url | terminal_url
+-------------------------+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html
+ 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html
+ 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html
+ 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html
+ 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html
+
+"The full version of the article is available only to subscribers of the journal"
+
+Paywall
+
+## peerj.com
+
+Previously: this is HTML of reviews (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope
+
+## publons.com
+
+Previously: this is HTML (2021-05_daily_improvements.md)
+
+NOTE: Should be HTML ingest, possibly special case scope (length of works)
+
+## stm.bookpi.org
+
+ link_source_id | base_url | terminal_url
+-----------------------------+---------------------------------------------+----------------------------------------------------
+ 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231
+ 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096
+ 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330
+ 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810
+ 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274
+
+These are... just abstracts of articles within a book? Weird. Maybe sketchy? DOIs via Crossref
+
+## www.cabi.org
+
+ link_source_id | base_url | terminal_url
+--------------------------+------------------------------------------+----------------------------------------------------
+ 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742
+ 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471
+ 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544
+ 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117
+ 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337
+
+Reviews? but just abstracts?
+
+## www.cureus.com
+
+-[ RECORD 1 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17547
+base_url | https://doi.org/10.7759/cureus.17547
+terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks
+-[ RECORD 2 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16867
+base_url | https://doi.org/10.7759/cureus.16867
+terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer
+-[ RECORD 3 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17425
+base_url | https://doi.org/10.7759/cureus.17425
+terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community
+-[ RECORD 4 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.17313
+base_url | https://doi.org/10.7759/cureus.17313
+terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic
+-[ RECORD 5 ]--+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+link_source_id | 10.7759/cureus.16943
+base_url | https://doi.org/10.7759/cureus.16943
+terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed
+
+Ugh, stupid "email to get PDF". but ingest seems to work anyways?
+
+NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar)
+
+## www.e-manuscripta.ch
+
+ link_source_id | base_url | terminal_url
+------------------------------+----------------------------------------------+-------------------------------------------------------------------
+ 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031
+ 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064
+ 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176
+ 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200
+ 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008
+
+Historical docs, single pages, but do have full PDF downloads.
+
+NOTE: re-ingest
+
+## www.inderscience.com
+
+Previously: paywall (2021-05_daily_improvements.md)
+
+## www.un-ilibrary.org
+
+ link_source_id | base_url | terminal_url
+----------------------------+--------------------------------------------+-------------------------------------------------------------
+ 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307
+ 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011
+ 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014
+ 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020
+ 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005
+
+Books and chapters. Doesn't seem to have actual download ability?
+
+# Re-Ingest / Re-Crawl
+
+Using fatcat-ingest helper tool.
+
+- www.isca-speech.org doi_prefix:10.21437
+ doi:* doi_prefix:10.21437 in_ia:false
+ 9,233
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json
+ => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221})
+- repository.dri.ie doi_prefix:10.7486
+ doi:* in_ia:false doi_prefix:10.7486
+ 56,532
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json
+ => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532})
+- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link)
+ 25,598
+ many are meeting abstracts
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json
+ => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598})
+- www.cureus.com doi_prefix:10.7759
+ 1,537
+ ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json
+ => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535})
+- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta
+ 110,945
+ TODO: all are marked 'unpublished', but that is actually probably right?
+- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!)
+ doi:* in_ia:false doi_prefix:10.3389
+ 212,370
+ doi:10.3389/conf.* => most seem to be just abstracts? how many like this?
+ container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k)
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 191k
+ but many might be components? this is actually kind of a mess
+ fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz
+ => 19.2k
+ ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json
+
+# Remaining Tasks / Domains (TODO)
+
+more complex crawling/content:
+- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764
+- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url
+- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data)
+- doi.ala.org.au: possible dataset ingest source
+- peerj.com, at least reviews, should be HTML ingest? or are some PDF?
+- publons.com should be HTML ingest, possibly special case for scope
+- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug
+
+other tasks:
+- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512
+- push/deploy sandcrawler changes