aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-09-03 18:34:33 -0700
committerBryan Newbold <bnewbold@archive.org>2021-09-03 18:34:33 -0700
commitd749a7a6a1c1d439596c5d053daf904638b4dbc2 (patch)
treefd5ed9dd6dfd688908554ed84fb8caccf426e156
parentc147ad35fa8ec59b8c015f8badae67c525f65253 (diff)
downloadsandcrawler-d749a7a6a1c1d439596c5d053daf904638b4dbc2.tar.gz
sandcrawler-d749a7a6a1c1d439596c5d053daf904638b4dbc2.zip
OAI-PMH patch and ingest improvement notes
-rw-r--r--notes/ingest/2021-08_oai_pmh_patch.md204
-rw-r--r--notes/ingest/2021-09-02_oai_pmh_patch.md1578
2 files changed, 1578 insertions, 204 deletions
diff --git a/notes/ingest/2021-08_oai_pmh_patch.md b/notes/ingest/2021-08_oai_pmh_patch.md
deleted file mode 100644
index 20bb451..0000000
--- a/notes/ingest/2021-08_oai_pmh_patch.md
+++ /dev/null
@@ -1,204 +0,0 @@
-
-Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially
-re-crawling content which failed to ingest the first time.
-
-## Basic Counts
-
- SELECT ingest_file_result.status, COUNT(*)
- FROM ingest_request
- LEFT JOIN ingest_file_result
- ON ingest_file_result.ingest_type = ingest_request.ingest_type
- AND ingest_file_result.base_url = ingest_request.base_url
- WHERE
- ingest_request.ingest_type = 'pdf'
- AND ingest_request.link_source = 'oai'
- AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
- AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
- AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
- AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
- AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
- GROUP BY status
- ORDER BY COUNT DESC
- LIMIT 20;
-
- status | count
- -------------------------+----------
- success | 14143967
- no-pdf-link | 12857899
- no-capture | 5501279
- redirect-loop | 2092667
- terminal-bad-status | 747387
- wrong-mimetype | 597212
- link-loop | 542143
- null-body | 93566
- cdx-error | 20514
- petabox-error | 18387
- | 15283
- wayback-error | 13996
- gateway-timeout | 510
- skip-url-blocklist | 184
- wayback-content-error | 145
- bad-redirect | 137
- redirects-exceeded | 120
- bad-gzip-encoding | 116
- timeout | 80
- spn2-cdx-lookup-failure | 58
- (20 rows)
-
-
- SELECT
- oai_prefix,
- COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
- COUNT(*) as total
- FROM (
- SELECT
- ingest_file_result.status as status,
- -- eg "oai:cwi.nl:4881"
- substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
- FROM ingest_request
- LEFT JOIN ingest_file_result
- ON ingest_file_result.ingest_type = ingest_request.ingest_type
- AND ingest_file_result.base_url = ingest_request.base_url
- WHERE
- ingest_request.ingest_type = 'pdf'
- AND ingest_request.link_source = 'oai'
- AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
- AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
- AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
- AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
- AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
- ) t1
- GROUP BY oai_prefix
- ORDER BY total DESC
- LIMIT 25;
-
- oai_prefix | success | total
- --------------------------+---------+---------
- repec | 1133019 | 2783448
- hal | 573019 | 1049607
- hsp.org | 0 | 810281
- www.irgrid.ac.cn | 18007 | 748828
- cds.cern.ch | 74078 | 688091
- americanae.aecid.es | 71309 | 572792
- juser.fz-juelich.de | 23026 | 518551
- espace.library.uq.edu.au | 6645 | 508960
- igi.indrastra.com | 59626 | 478577
- archive.ugent.be | 65269 | 424014
- hrcak.srce.hr | 403719 | 414897
- zir.nsk.hr | 156753 | 397200
- renati.sunedu.gob.pe | 79362 | 388355
- hypotheses.org | 3 | 374296
- rour.neicon.ru | 7997 | 354529
- generic.eprints.org | 263564 | 340470
- invenio.nusl.cz | 6340 | 325867
- evastar-karlsruhe.de | 62277 | 317952
- quod.lib.umich.edu | 5 | 309135
- diva.org | 67917 | 298348
- t2r2.star.titech.ac.jp | 1085 | 289388
- edpsciences.org | 139495 | 284972
- repository.ust.hk | 10243 | 283417
- revues.org | 151156 | 277497
- pure.atira.dk | 13492 | 260754
- (25 rows)
-
-Top counts by OAI prefix and status:
-
- SELECT
- oai_prefix,
- status,
- COUNT((oai_prefix,status))
- FROM (
- SELECT
- ingest_file_result.status as status,
- -- eg "oai:cwi.nl:4881"
- substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
- FROM ingest_request
- LEFT JOIN ingest_file_result
- ON ingest_file_result.ingest_type = ingest_request.ingest_type
- AND ingest_file_result.base_url = ingest_request.base_url
- WHERE
- ingest_request.ingest_type = 'pdf'
- AND ingest_request.link_source = 'oai'
- AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
- AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
- AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
- AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
- AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
- AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
- ) t1
- GROUP BY oai_prefix, status
- ORDER BY COUNT DESC
- LIMIT 40;
-
-
- oai_prefix | status | count
- ---------------------------+---------------+---------
- repec | success | 1133019
- hsp.org | no-pdf-link | 794781
- repec | no-pdf-link | 638124
- hal | success | 573020
- cds.cern.ch | no-capture | 540380
- repec | redirect-loop | 516434
- juser.fz-juelich.de | no-pdf-link | 477881
- americanae.aecid.es | no-pdf-link | 417766
- hrcak.srce.hr | success | 403720
- www.irgrid.ac.cn | no-pdf-link | 370908
- hal | no-pdf-link | 359261
- www.irgrid.ac.cn | no-capture | 355532
- espace.library.uq.edu.au | no-pdf-link | 320479
- igi.indrastra.com | no-pdf-link | 318242
- repec | no-capture | 317062
- invenio.nusl.cz | no-pdf-link | 309802
- rour.neicon.ru | redirect-loop | 300911
- hypotheses.org | no-pdf-link | 300251
- renati.sunedu.gob.pe | no-capture | 282800
- t2r2.star.titech.ac.jp | no-pdf-link | 272045
- generic.eprints.org | success | 263564
- quod.lib.umich.edu | no-pdf-link | 259661
- archive.ugent.be | no-capture | 256164
- evastar-karlsruhe.de | no-pdf-link | 248939
- zir.nsk.hr | link-loop | 226919
- repository.ust.hk | no-pdf-link | 208569
- edoc.mpg.de | no-pdf-link | 199758
- bibliotecadigital.jcyl.es | no-pdf-link | 188433
- orbi.ulg.ac.be | no-pdf-link | 172373
- diva.org | no-capture | 171115
- lup.lub.lu.se | no-pdf-link | 168652
- erudit.org | success | 168490
- ojs.pkp.sfu.ca | success | 168029
- lib.dr.iastate.edu | success | 158494
- zir.nsk.hr | success | 156753
- digital.kenyon.edu | success | 154900
- revues.org | success | 151156
- books.openedition.org | no-pdf-link | 149607
- freidok.uni-freiburg.de | no-pdf-link | 146837
- digitalcommons.unl.edu | success | 144025
- (40 rows)
-
-TODO: also exclude:
-
- oai:nsp.org: (philly historical society)
-
-TODO: more rows for success/total query (aka, increase LIMIT)
-
-TODO: wait until MAG crawl is complete to re-run ingest? otherwise many
-no-capture may actually be (recently) captured. depends on size of MAG crawl I
-guess.
-
-TODO: just delete the "excluded" rows?
-TODO: do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works
-TODO: do random sampling of 'no-pdf-link' URLs, see if newer sandcrawler works
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
new file mode 100644
index 0000000..fded7b3
--- /dev/null
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -0,0 +1,1578 @@
+
+Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially
+re-crawling content which failed to ingest the first time.
+
+May fold this in with more general patch crawling.
+
+## Basic Counts
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 14145387
+ no-pdf-link | 12063022
+ no-capture | 5485640
+ redirect-loop | 2092705
+ terminal-bad-status | 747372
+ wrong-mimetype | 597219
+ link-loop | 542144
+ null-body | 93566
+ cdx-error | 19798
+ petabox-error | 17943
+ | 15283
+ wayback-error | 13897
+ gateway-timeout | 511
+ skip-url-blocklist | 184
+ wayback-content-error | 146
+ bad-redirect | 137
+ redirects-exceeded | 120
+ bad-gzip-encoding | 116
+ timeout | 80
+ blocked-cookie | 64
+ (20 rows)
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 40;
+
+
+ oai_prefix | success | total
+ ---------------------------+---------+---------
+ repec | 1133175 | 2783448
+ hal | 573218 | 1049607
+ www.irgrid.ac.cn | 18007 | 748828
+ cds.cern.ch | 74078 | 688091
+ americanae.aecid.es | 71310 | 572792
+ juser.fz-juelich.de | 23026 | 518551
+ espace.library.uq.edu.au | 6649 | 508960
+ igi.indrastra.com | 59629 | 478577
+ archive.ugent.be | 65306 | 424014
+ hrcak.srce.hr | 404085 | 414897
+ zir.nsk.hr | 156753 | 397200
+ renati.sunedu.gob.pe | 79362 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7997 | 354529
+ generic.eprints.org | 263566 | 340470
+ invenio.nusl.cz | 6340 | 325867
+ evastar-karlsruhe.de | 62282 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ diva.org | 67917 | 298348
+ t2r2.star.titech.ac.jp | 1085 | 289388
+ edpsciences.org | 139495 | 284972
+ repository.ust.hk | 10245 | 283417
+ revues.org | 151156 | 277497
+ pure.atira.dk | 13492 | 260754
+ bibliotecadigital.jcyl.es | 50606 | 254134
+ escholarship.org/ark | 140835 | 245203
+ ojs.pkp.sfu.ca | 168029 | 229387
+ lup.lub.lu.se | 49358 | 226602
+ library.wur.nl | 15051 | 216738
+ digitalrepository.unm.edu | 111704 | 211749
+ infoscience.tind.io | 60166 | 207299
+ edoc.mpg.de | 0 | 205252
+ erudit.org | 168490 | 197803
+ delibra.bg.polsl.pl | 38666 | 196652
+ n/a | 0 | 193814
+ aleph.bib-bvb.de | 4349 | 186666
+ serval.unil.ch | 41643 | 186372
+ orbi.ulg.ac.be | 2400 | 184551
+ digitalcommons.unl.edu | 144025 | 184372
+ bib-pubdb1.desy.de | 33525 | 182717
+ (40 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 50;
+
+ oai_prefix | status | count
+ ---------------------------+---------------+---------
+ repec | success | 1133175
+ repec | no-pdf-link | 638105
+ hal | success | 573218
+ cds.cern.ch | no-capture | 540380
+ repec | redirect-loop | 516451
+ juser.fz-juelich.de | no-pdf-link | 477881
+ americanae.aecid.es | no-pdf-link | 417766
+ hrcak.srce.hr | success | 404085
+ www.irgrid.ac.cn | no-pdf-link | 370908
+ hal | no-pdf-link | 359252
+ www.irgrid.ac.cn | no-capture | 355532
+ espace.library.uq.edu.au | no-pdf-link | 320479
+ igi.indrastra.com | no-pdf-link | 318242
+ repec | no-capture | 316981
+ invenio.nusl.cz | no-pdf-link | 309802
+ rour.neicon.ru | redirect-loop | 300911
+ hypotheses.org | no-pdf-link | 300251
+ renati.sunedu.gob.pe | no-capture | 282800
+ t2r2.star.titech.ac.jp | no-pdf-link | 272045
+ generic.eprints.org | success | 263566
+ quod.lib.umich.edu | no-pdf-link | 259661
+ archive.ugent.be | no-capture | 256127
+ evastar-karlsruhe.de | no-pdf-link | 248939
+ zir.nsk.hr | link-loop | 226919
+ repository.ust.hk | no-pdf-link | 208569
+ edoc.mpg.de | no-pdf-link | 199758
+ bibliotecadigital.jcyl.es | no-pdf-link | 188433
+ orbi.ulg.ac.be | no-pdf-link | 172373
+ diva.org | no-capture | 171115
+ lup.lub.lu.se | no-pdf-link | 168652
+ erudit.org | success | 168490
+ ojs.pkp.sfu.ca | success | 168029
+ lib.dr.iastate.edu | success | 158494
+ zir.nsk.hr | success | 156753
+ digital.kenyon.edu | success | 154900
+ revues.org | success | 151156
+ books.openedition.org | no-pdf-link | 149607
+ freidok.uni-freiburg.de | no-pdf-link | 146837
+ digitalcommons.unl.edu | success | 144025
+ escholarship.org/ark | success | 140835
+ culeuclid | link-loop | 140291
+ edpsciences.org | success | 139495
+ serval.unil.ch | no-pdf-link | 138644
+ bib-pubdb1.desy.de | no-pdf-link | 133815
+ krm.or.kr | no-pdf-link | 132461
+ pure.atira.dk | no-pdf-link | 132179
+ oai-gms.dimdi.de | redirect-loop | 131409
+ aleph.bib-bvb.de | no-capture | 128261
+ library.wur.nl | no-pdf-link | 124718
+ lirias2repo.kuleuven.be | no-capture | 123106
+ (50 rows)
+
+Note: could just delete the "excluded" rows? and not harvest them in the
+future, and filter them at ingest time (in transform script).
+
+
+
+## Investigate no-pdf-link sandcrawler improvements
+
+Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works:
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%'
+ ORDER BY random()
+ LIMIT 10;
+
+Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works):
+
+ \x auto
+
+ SELECT
+ ingest_request.link_source_id AS oai_id,
+ ingest_request.base_url as base_url ,
+ ingest_file_result.terminal_url as terminal_url
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_file_result.status = 'no-pdf-link'
+ ORDER BY random()
+ LIMIT 30;
+
+### repec (SKIP-PREFIX)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351
+base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115
+base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2
+terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------
+--------------------------------------
+oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92
+base_url | http://pz.wz.uw.edu.pl/en
+terminal_url | http://pz.wz.uw.edu.pl:80/en
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35
+base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html
+terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647
+base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf
+terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75
+base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec
+terminal_url | https://www.jstor.org/stable/1884373
+
+Huh! This is just a catalog of other domains. Should probably skip
+
+DONE: skip/filter repec
+
+### juser.fz-juelich.de (SCOPE)
+
+-[ RECORD 1 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:132217
+base_url | http://juser.fz-juelich.de/record/132217
+terminal_url | http://juser.fz-juelich.de/record/132217
+
+Poster; no files.
+
+-[ RECORD 2 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:268598
+base_url | http://juser.fz-juelich.de/record/268598
+terminal_url | http://juser.fz-juelich.de/record/268598
+
+Journal.
+
+-[ RECORD 3 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:126613
+base_url | http://juser.fz-juelich.de/record/126613
+terminal_url | http://juser.fz-juelich.de/record/126613
+
+-[ RECORD 4 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:67362
+base_url | http://juser.fz-juelich.de/record/67362
+terminal_url | http://juser.fz-juelich.de/record/67362
+-[ RECORD 5 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:869189
+base_url | http://juser.fz-juelich.de/record/869189
+terminal_url | http://juser.fz-juelich.de/record/869189
+-[ RECORD 6 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:810746
+base_url | http://juser.fz-juelich.de/record/810746
+terminal_url | http://juser.fz-juelich.de/record/810746
+-[ RECORD 7 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:52897
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22
+-[ RECORD 8 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:114755
+base_url | http://juser.fz-juelich.de/record/114755
+terminal_url | http://juser.fz-juelich.de/record/114755
+-[ RECORD 9 ]+------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:58025
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22
+
+The search URLs seem redundant? Not going to try to handle those.
+
+"Powered by Invenio v1.1.7"
+
+All of these examples seem to be not papers. Maybe we can filter these better
+at the harvest or transform stage?
+
+### americanae.aecid.es (MIXED)
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:502896
+base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai
+
+just a metadata record? links to redalyc
+
+METADATA-ONLY
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:534600
+base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:524567
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567
+
+NOT-FOUND (404)
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:378914
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914
+
+Some single-page image archival thing? bespoke, skipping.
+
+SKIP-BESPOKE
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:526142
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142
+
+NOT-FOUND (404)
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------
+oai_id | oai:americanae.aecid.es:373408
+base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408
+
+NOT-FOUND (404)
+
+### www.irgrid.ac.cn (SKIP-PREFIX)
+
+Chinese Academy of Sciences Institutional Repositories Grid
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1749980
+base_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980
+
+Can't access
+
+FORBIDDEN
+
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/857397
+base_url | http://www.irgrid.ac.cn/handle/1471x/857397
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397
+
+Just linking to another IR; skip it.
+
+http://ir.ipe.ac.cn/handle/122111/10608
+
+requires login
+
+DONE: '/password-login;jsessionid' as a loginwall URL pattern
+ http://ir.ipe.ac.cn/handle/122111/10608
+ http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf
+
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1060447
+base_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1671377
+base_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/1178430
+base_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2488017
+base_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/977147
+base_url | http://www.irgrid.ac.cn/handle/1471x/977147
+terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:www.irgrid.ac.cn:1471x/2454503
+base_url | http://ir.nwipb.ac.cn/handle/363003/9957
+terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957
+
+this domain is a disapointment :(
+
+should continue crawling, as the metadata is open and good. but won't get fulltext?
+
+### hal (FIXED-PARTIAL)
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00744951v1
+base_url | https://hal.archives-ouvertes.fr/hal-00744951
+terminal_url | https://hal.archives-ouvertes.fr/hal-00744951
+
+Off-site OA link.
+
+FIXED-HAL
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-01065398v1
+base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf
+terminal_url | https://hal.archives-ouvertes.fr/index/index
+-[ RECORD 3 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:lirmm-00371599v1
+base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599
+
+To elsevier :(
+
+-[ RECORD 4 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00284780v1
+base_url | https://hal.archives-ouvertes.fr/hal-00284780
+terminal_url | https://hal.archives-ouvertes.fr/hal-00284780
+
+METADATA-ONLY
+
+-[ RECORD 5 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00186151v1
+base_url | https://hal.archives-ouvertes.fr/hal-00186151
+terminal_url | https://hal.archives-ouvertes.fr/hal-00186151
+
+METADATA-ONLY
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------
+oai_id | oai:hal:hal-00399754v1
+base_url | https://hal.archives-ouvertes.fr/hal-00399754
+terminal_url | https://hal.archives-ouvertes.fr/hal-00399754
+
+METADATA-ONLY
+
+
+### espace.library.uq.edu.au (SKIP)
+
+-[ RECORD 1 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:136497
+base_url | https://espace.library.uq.edu.au/view/UQ:136497
+terminal_url | https://espace.library.uq.edu.au/view/UQ:136497
+-[ RECORD 2 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:411389
+base_url | https://espace.library.uq.edu.au/view/UQ:411389
+terminal_url | https://espace.library.uq.edu.au/view/UQ:411389
+-[ RECORD 3 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:401773
+base_url | https://espace.library.uq.edu.au/view/UQ:401773
+terminal_url | https://espace.library.uq.edu.au/view/UQ:401773
+-[ RECORD 4 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:675334
+base_url | https://espace.library.uq.edu.au/view/UQ:675334
+terminal_url | https://espace.library.uq.edu.au/view/UQ:675334
+-[ RECORD 5 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:312311
+base_url | https://espace.library.uq.edu.au/view/UQ:312311
+terminal_url | https://espace.library.uq.edu.au/view/UQ:312311
+-[ RECORD 6 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:209401
+base_url | https://espace.library.uq.edu.au/view/UQ:209401
+terminal_url | https://espace.library.uq.edu.au/view/UQ:209401
+-[ RECORD 7 ]+------------------------------------------------
+oai_id | oai:espace.library.uq.edu.au:uq:327188
+base_url | https://espace.library.uq.edu.au/view/UQ:327188
+terminal_url | https://espace.library.uq.edu.au/view/UQ:327188
+
+Very javascript heavy (skeletal HTML). And just links to fulltext on publisher
+sites.
+
+### igi.indrastra.com (METADATA-ONLY)
+
+-[ RECORD 1 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:267221
+base_url | http://igi.indrastra.com/items/show/267221
+terminal_url | http://igi.indrastra.com/items/show/267221
+-[ RECORD 2 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:181799
+base_url | http://igi.indrastra.com/items/show/181799
+terminal_url | http://igi.indrastra.com/items/show/181799
+-[ RECORD 3 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:125382
+base_url | http://igi.indrastra.com/items/show/125382
+terminal_url | http://igi.indrastra.com/items/show/125382
+-[ RECORD 4 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:47266
+base_url | http://igi.indrastra.com/items/show/47266
+terminal_url | http://igi.indrastra.com/items/show/47266
+-[ RECORD 5 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:12872
+base_url | http://igi.indrastra.com/items/show/12872
+terminal_url | http://igi.indrastra.com/items/show/12872
+-[ RECORD 6 ]+---------------------------------------------------------
+oai_id | oai:igi.indrastra.com:231620
+base_url | http://igi.indrastra.com/items/show/231620
+terminal_url | http://igi.indrastra.com/items/show/231620
+
+"Proudly powered by Omeka"
+
+### invenio.nusl.cz (METADATA-ONLY)
+
+ oai_id | base_url | terminal_url
+----------------------------+------------------------------------+--------------------------------------
+ oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409
+ oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783
+ oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961
+ oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | http://invenio.nusl.cz/record/318800
+ oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695
+ oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393
+ oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987
+ oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396
+ oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512
+ oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631
+
+Metadata only (at least this set)
+
+### hypotheses.org
+
+-[ RECORD 1 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:mittelalter/9529
+base_url | http://mittelalter.hypotheses.org/9529
+terminal_url | https://mittelalter.hypotheses.org/9529
+-[ RECORD 2 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/18638
+base_url | http://archivalia.hypotheses.org/18638
+terminal_url | https://archivalia.hypotheses.org/18638
+-[ RECORD 3 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivalia/13614
+base_url | http://archivalia.hypotheses.org/13614
+terminal_url | https://archivalia.hypotheses.org/13614
+-[ RECORD 4 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:teteschercheuses/2785
+base_url | http://teteschercheuses.hypotheses.org/2785
+terminal_url | https://teteschercheuses.hypotheses.org/2785
+-[ RECORD 5 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:altervsego/608
+base_url | http://altervsego.hypotheses.org/608
+terminal_url | http://altervsego.hypotheses.org/608
+-[ RECORD 6 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:archivewk1/21905
+base_url | http://archivewk1.hypotheses.org/21905
+terminal_url | https://archivewk1.hypotheses.org/21905
+-[ RECORD 7 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:slkdiaspo/3321
+base_url | http://slkdiaspo.hypotheses.org/3321
+terminal_url | https://slkdiaspo.hypotheses.org/3321
+-[ RECORD 8 ]+---------------------------------------------
+oai_id | oai:hypotheses.org:diga/280
+base_url | http://diga.hypotheses.org/280
+terminal_url | https://diga.hypotheses.org/280
+
+These are all a big mix... basically blogs. Should continue crawling, but expect no yield.
+
+### t2r2.star.titech.ac.jp (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00105099
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00101346
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50161100
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:00232407
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50120040
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50321440
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50235666
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778
+
+
+### quod.lib.umich.edu
+
+-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2
+base_url | http://name.umdl.umich.edu/acf2679.0015.003
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:b14970.0001.001
+base_url | http://name.umdl.umich.edu/B14970.0001.001
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3
+base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43
+base_url | http://name.umdl.umich.edu/acg2248.1-16.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9
+base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9
+base_url | http://name.umdl.umich.edu/acg1336.1-24.006
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------
+oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a
+base_url | http://name.umdl.umich.edu/africanamer.0002.32a
+terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a
+
+These are... issues of journals? Should continue to crawl, but not expect much.
+
+### evastar-karlsruhe.de (METADATA-ONLY)
+
+-[ RECORD 1 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:270011444
+base_url | https://publikationen.bibliothek.kit.edu/270011444
+terminal_url | https://publikationen.bibliothek.kit.edu/270011444
+-[ RECORD 2 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000050117
+base_url | https://publikationen.bibliothek.kit.edu/1000050117
+terminal_url | https://publikationen.bibliothek.kit.edu/1000050117
+-[ RECORD 3 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:362296
+base_url | https://publikationen.bibliothek.kit.edu/362296
+terminal_url | https://publikationen.bibliothek.kit.edu/362296
+-[ RECORD 4 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:23042000
+base_url | https://publikationen.bibliothek.kit.edu/23042000
+terminal_url | https://publikationen.bibliothek.kit.edu/23042000
+-[ RECORD 5 ]+----------------------------------------------------
+oai_id | oai:evastar-karlsruhe.de:1000069945
+base_url | https://publikationen.bibliothek.kit.edu/1000069945
+terminal_url | https://publikationen.bibliothek.kit.edu/1000069945
+
+
+### repository.ust.hk
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-67233
+base_url | http://repository.ust.hk/ir/Record/1783.1-67233
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233
+-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-63232
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017
+terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-2891
+base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103
+terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com
+-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-56231
+base_url | http://repository.ust.hk/ir/Record/1783.1-56231
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231
+
+[...]
+
+-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-24872
+base_url | http://repository.ust.hk/ir/Record/1783.1-24872
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872
+-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-3457
+base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+terminal_url | http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations
+-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-73215
+base_url | http://repository.ust.hk/ir/Record/1783.1-73215
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215
+
+DONE: gateway.isiknowledge.com is bogus/blocking?
+
+
+### edoc.mpg.de (SKIP-DEPRECATED)
+
+ oai_id | base_url | terminal_url
+------------------------+---------------------------+---------------------------
+ oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650
+ oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195
+ oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655
+ oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179
+ oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141
+ oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412
+ oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531
+ oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047
+ oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650
+ oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852
+
+This whole instance seems to have been replaced
+
+### bibliotecadigital.jcyl.es (SKIP-DIGITIZED)
+
+-[ RECORD 1 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000039962
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664
+-[ RECORD 2 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14075
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075
+-[ RECORD 3 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:4842
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842
+-[ RECORD 4 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:14799
+base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799
+-[ RECORD 5 ]+--------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:821
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474
+
+Digitized images as pages; too much to deal with for now.
+
+### orbi.ulg.ac.be
+
+-[ RECORD 1 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/128079
+base_url | https://orbi.uliege.be/handle/2268/128079
+terminal_url | https://orbi.uliege.be/handle/2268/128079
+-[ RECORD 2 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/67659
+base_url | https://orbi.uliege.be/handle/2268/67659
+terminal_url | https://orbi.uliege.be/handle/2268/67659
+-[ RECORD 3 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/35521
+base_url | https://orbi.uliege.be/handle/2268/35521
+terminal_url | https://orbi.uliege.be/handle/2268/35521
+-[ RECORD 4 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/107922
+base_url | https://orbi.uliege.be/handle/2268/107922
+terminal_url | https://orbi.uliege.be/handle/2268/107922
+-[ RECORD 5 ]+----------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/215694
+base_url | https://orbi.uliege.be/handle/2268/215694
+terminal_url | https://orbi.uliege.be/handle/2268/215694
+
+Described below.
+
+### library.wur.nl (FIXED-BESPOKE)
+
+ oai_id | base_url | terminal_url
+ -----------------------------------+------------------------------------------------+------------------------------------------------
+ oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939
+ oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707
+ oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208
+ oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378
+ oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416
+ oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930
+ oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076
+ oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109
+ oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146
+ oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922
+ (10 rows)
+
+Seems like a one-off site? But added a pattern.
+
+### pure.atira.dk
+
+-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38
+base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694
+terminal_url | https://www.tandfonline.com/action/cookieAbsent
+-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41
+base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html
+
+Metadata only
+
+DONE: /cookieAbsent is cookie block
+ https://www.tandfonline.com/action/cookieAbsent
+
+### bib-pubdb1.desy.de (FIXED-INVENIO)
+
+-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:96756
+base_url | http://bib-pubdb1.desy.de/record/96756
+terminal_url | http://bib-pubdb1.desy.de/record/96756
+
+Metadata only.
+
+-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:416556
+base_url | http://bib-pubdb1.desy.de/record/416556
+terminal_url | http://bib-pubdb1.desy.de/record/416556
+
+Fixed!
+
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:414545
+base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22
+-[ RECORD 5 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:170169
+base_url | http://bib-pubdb1.desy.de/record/170169
+terminal_url | http://bib-pubdb1.desy.de/record/170169
+-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:191154
+base_url | http://bib-pubdb1.desy.de/record/191154
+terminal_url | http://bib-pubdb1.desy.de/record/191154
+
+Metadata only
+
+-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:155092
+base_url | http://bib-pubdb1.desy.de/record/155092
+terminal_url | http://bib-pubdb1.desy.de/record/155092
+
+Fixed!
+
+-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bib-pubdb1.desy.de:97158
+base_url | http://bib-pubdb1.desy.de/record/97158
+terminal_url | http://bib-pubdb1.desy.de/record/97158
+
+Metadata only
+
+"Powered by Invenio v1.1.7"
+
+Can/should skip the "search" URLs
+
+### serval.unil.ch
+
+-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_60346fc75171
+base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171
+-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_4db47fc4b593
+base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593
+-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_57aac24fe115
+base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154
+-[ RECORD 4 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_deabae6baf6c
+base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C
+-[ RECORD 5 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_a5ec0df1370f
+base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd
+-[ RECORD 6 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_080300c2e23c
+base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf
+terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679
+-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_de777dd2b07f
+base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F
+-[ RECORD 8 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:serval.unil.ch:bib_5e824e244c27
+base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27
+terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27
+
+Metadata only? See elsewhere.
+
+### Random Links
+
+-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dbc.wroc.pl:41031
+base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031
+
+This is some platform/package thing. PDF is in an iframe. Platform is "DLibra".
+FIXED-DLIBRA
+
+-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:orbi.ulg.ac.be:2268/174291
+base_url | https://orbi.uliege.be/handle/2268/174291
+terminal_url | https://orbi.uliege.be/handle/2268/174291
+
+DSpace platform. There are multiple files, and little to "select" on.
+
+https://orbi.uliege.be/handle/2268/174200 has only single PDF and easier to work with
+
+PARTIAL-DSPACE
+
+-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.tue.nl:664163
+base_url | http://repository.tue.nl/664163
+terminal_url | http://repository.tue.nl/664163
+
+Ah, this is the Pure platform from Elsevier.
+Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance
+
+FIXED-PURE
+
+
+-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:juser.fz-juelich.de:49579
+base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22
+
+(handled above)
+
+-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/97937
+base_url | https://orcid.org/0000-0002-2066-2082
+terminal_url | https://orcid.org/0000-0002-2066-2082
+
+ORCID! Skip it.
+
+DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time.
+
+-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:edoc.mpg.de:360269
+base_url | http://edoc.mpg.de/360269
+terminal_url | http://edoc.mpg.de/360269
+
+Seems like this whole repo has disapeared, or been replaced by... pure? maybe a different pure?
+
+DONE: edoc.mpg.de -> pure.mpg.de
+
+-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:books.openedition.org:msha/17716
+base_url | http://books.openedition.org/msha/17716
+terminal_url | https://books.openedition.org/msha/17716
+
+Open edition is free to read HTML, but not PDF (or epub, etc).
+
+TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest)
+
+HTML-WORKED
+
+-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epub.oeaw.ac.at:0x003aba48
+base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf
+
+requires login
+
+FORBIDDEN
+
+-[ RECORD 9 ]+---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.mit.edu:1721.1/88986
+base_url | https://orcid.org/0000-0002-4147-2560
+terminal_url | https://orcid.org/0000-0002-4147-2560
+
+DONE: skip orcids
+
+-[ RECORD 10 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.ust.hk:1783.1-28786
+base_url | http://repository.ust.hk/ir/Record/1783.1-28786
+terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786
+
+Generator: VuFind 5.1.1
+just a metadata record
+
+METADATA-ONLY
+
+-[ RECORD 11 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:rcin.org.pl:50797
+base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472
+
+Seems like a software platform? not sure.
+
+METADATA-ONLY
+
+-[ RECORD 12 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dea.lib.unideb.hu:2437/69641
+base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&amp;resultview=long&amp;ccltext=idno+bibFSZ1008709
+
+-[ RECORD 13 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871
+base_url | http://handle.unsw.edu.au/1959.4/64871
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L
+
+-[ RECORD 14 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.wbc.poznan.pl:225930
+base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930
+
+SOFT-404
+
+-[ RECORD 15 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:repository.erciyes.edu.tr:105
+base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105
+terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105
+
+GONE (domain not registered)
+
+-[ RECORD 16 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:37500
+base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13
+
+Seems like a bespoke site
+
+SKIP-BESPOKE
+
+-[ RECORD 17 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:t2r2.star.titech.ac.jp:50401364
+base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313
+
+METADATA-ONLY
+
+-[ RECORD 18 ]---------------------------------------------------------------------------------------------------------------------
+oai_id | oai:epubs.cclrc.ac.uk:work/4714
+base_url | http://purl.org/net/epubs/work/4714
+terminal_url | https://epubs.stfc.ac.uk/work/4714
+
+It's got a purl! haha.
+
+METADATA-ONLY
+
+------
+
+Another batch! With some repeat domains removed.
+
+-[ RECORD 1 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc
+base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc
+terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov
+
+SKIP
+
+-[ RECORD 2 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-05302014-183910
+base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/
+
+Some software platform? Pretty basic/bespoke
+
+FIXED-PARTIAL
+
+-[ RECORD 3 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecadigital.jcyl.es:10000098246
+base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451
+
+SKIP (see elsewhere)
+
+-[ RECORD 7 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:elektra.cdaea.es:documento.29259
+base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259
+
+Photo.
+
+SKIP-SCOPE
+
+-[ RECORD 9 ]+-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829
+base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829
+terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L
+
+METADATA-ONLY
+
+-[ RECORD 12 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a
+base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+terminal_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html
+
+unsure
+
+-[ RECORD 16 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/369344
+base_url | https://library.wur.nl/WebQuery/wurpubs/369344
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344
+
+this specific record not OA (but site is fine/fixed)
+
+-[ RECORD 17 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:escholarship.umassmed.edu:oapubs-2146
+base_url | https://escholarship.umassmed.edu/oapubs/1147
+terminal_url | http://escholarship.umassmed.edu/oapubs/1147/
+
+just links to publisher (no content in repo)
+
+-[ RECORD 18 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010
+base_url | https://digitalcommons.usu.edu/wild_facpub/11
+terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/
+
+also just links to publisher (no content in repo)
+
+-[ RECORD 25 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:igi.indrastra.com:306768
+base_url | http://igi.indrastra.com/items/show/306768
+terminal_url | http://igi.indrastra.com/items/show/306768
+
+(see elsewhere)
+
+-[ RECORD 26 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:fau.digital.flvc.org:fau_9804
+base_url | http://purl.flvc.org/fcla/dt/12932
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804
+
+Islandora.
+
+-[ RECORD 27 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:dspace.lu.lv:7/16019
+base_url | https://dspace.lu.lv/dspace/handle/7/16019
+terminal_url | https://dspace.lu.lv/dspace/handle/7/16019
+
+LOGINWALL
+
+-[ RECORD 28 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:zir.nsk.hr:umas_218
+base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218
+
+REMOVED
+
+
+-[ RECORD 29 ]-----------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:digi.ub.uni-heidelberg.de:36390
+base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5
+
+Book, with chapters, not an individual work.
+
+-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:krm.or.kr:10056135m201r
+base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y
+terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135
+
+research results repository; keep crawling
+
+SKIP-SCOPE
+
+-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:www.db-thueringen.de:dbt_mods_00005191
+base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191
+
+powered by "MyCoRe"
+
+FIXED-MYCORE
+
+-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405
+base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405
+
+seems to be a general purpose regional library? not research-specific
+
+SKIP-UNSURE
+
+-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:etd.adm.unipi.it:etd-02272019-123644
+base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/
+
+This specific URL is not available (FORBIDDEN)
+
+others have multiple files, not just a single PDF:
+https://etd.adm.unipi.it/t/etd-09102013-124430/
+
+SKIP-UNSURE
+
+-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:commons.ln.edu.hk:sw_master-5408
+base_url | https://commons.ln.edu.hk/sw_master/4408
+terminal_url | https://commons.ln.edu.hk/sw_master/4408/
+
+worth crawling I guess
+
+METADATA-ONLY
+
+-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:mouseion.jax.org:ssbb1976-1224
+base_url | https://mouseion.jax.org/ssbb1976/225
+terminal_url | https://mouseion.jax.org/ssbb1976/225/
+
+METADATA-ONLY
+
+-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aleph.bib-bvb.de:bvb01-016604343
+base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer
+terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true
+
+SOFT-404 / FORBIDDEN (cookie timeout)
+
+-[ RECORD 14 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:bivaldi.gva.es:11740
+base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740
+
+
+-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:library.wur.nl:wurpubs/443282
+base_url | https://library.wur.nl/WebQuery/wurpubs/443282
+terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282
+
+DIGIBIS platform (like some others)
+
+FIXED-PARTIAL
+
+-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:hal:in2p3-00414135v1
+base_url | http://hal.in2p3.fr/in2p3-00414135
+terminal_url | http://hal.in2p3.fr:80/in2p3-00414135
+
+METADATA-ONLY
+
+-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:aaltodoc.aalto.fi:123456789/13201
+base_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201
+
+This specific record is not accessible.
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002
+
+DSpace 5.4
+
+Worked (from recent changes)
+
+
+-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+oai_id | oai:sedici.unlp.edu.ar:10915/40144
+base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view
+
+This is a journal! Cool. Plone software platform.
+
+FIXED
+
+## Top no-capture Domains
+
+Top terminal no-capture domains:
+
+ SELECT domain, COUNT(domain)
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | count
+ -----------------------------------+-------
+ digitalrepository.unm.edu | 94087
+ escholarship.org | 80632
+ ir.opt.ac.cn | 70504
+ idus.us.es | 67908
+ www.cambridge.org | 56376
+ www.ssoar.info | 52534
+ rep.bntu.by | 52127
+ scholarworks.umt.edu | 48546
+ publikationen.ub.uni-frankfurt.de | 46987
+ dk.um.si | 45753
+ repositorio.uladech.edu.pe | 37028
+ uu.diva-portal.org | 34929
+ digitalcommons.law.byu.edu | 31732
+ sedici.unlp.edu.ar | 31233
+ elib.sfu-kras.ru | 29131
+ jyx.jyu.fi | 28144
+ www.repository.cam.ac.uk | 27728
+ nagoya.repo.nii.ac.jp | 26673
+ www.duo.uio.no | 25258
+ www.persee.fr | 24968
+ www2.senado.leg.br | 24426
+ tesis.ucsm.edu.pe | 24049
+ digitalcommons.unl.edu | 21974
+ www.degruyter.com | 21940
+ www.igi-global.com | 20736
+ thekeep.eiu.edu | 20712
+ docs.lib.purdue.edu | 20538
+ repositorio.cepal.org | 20280
+ elib.bsu.by | 19620
+ minds.wisconsin.edu | 19473
+ (30 rows)
+
+These all seem worth crawling. A couple publishers (cambridge.org), and
+persee.fr will probably fail, but not too many URLs.
+
+## Summary of Filtered Prefixes and Domains (OAI-PMH)
+
+oai:kb.dk:
+ too large and generic
+oai:bdr.oai.bsb-muenchen.de:
+ too large and generic
+oai:hispana.mcu.es:
+ too large and generic
+oai:bnf.fr:
+ too large and generic
+oai:ukm.si:
+ too large and generic
+oai:biodiversitylibrary.org:
+ redundant with other ingest and archive.org content
+oai:hsp.org:
+ large; historical content only
+oai:repec:
+ large; mostly (entirely?) links to publisher sites
+oai:n/a:
+ meta?
+oai:quod.lib.umich.edu:
+ entire issues? hard to crawl so skip for now
+oai:hypotheses.org:
+ HTML, not PDF
+oai:americanae.aecid.es:
+ large, complex. skip for now
+oai:www.irgrid.ac.cn:
+ aggregator of other IRs
+oai:espace.library.uq.edu.au:
+ large; metadata only; javascript heavy (poor heritrix crawling)
+oai:edoc.mpg.de:
+ deprecated domain, with no redirects
+oai:bibliotecadigital.jcyl.es:
+ digitized historical docs; hard to crawl, skip for now
+oai:repository.erciyes.edu.tr:
+ gone (domain lapsed)
+oai:krm.or.kr:
+ "research results repository" (metadata only)
+
+www.kb.dk
+ large, general purpose, scope
+kb-images.kb.dk
+ deprecated
+mdz-nbn-resolving.de
+ multiple prefixes end up here. historical docs, scope
+aggr.ukm.um.si
+ large, out of scope
+edoc.mpg.de
+ deprecated domain
+doaj.org
+ index (metadata only)
+orcid.org
+ out of scope
+gateway.isiknowledge.com
+ clarivate login/payall (skipping in ingest)
+
+Needs filtering to a subset of records (by 'set' or other filtering?):
+
+oai:igi.indrastra.com:
+oai:invenio.nusl.cz:
+oai:t2r2.star.titech.ac.jp:
+oai:evastar-karlsruhe.de:
+oai:repository.ust.hk:
+oai:serval.unil.ch:
+oai:pure.atira.dk:
+
+FIlters in SQL syntax:
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+
+and in some contexts (PDFs; switch to HTML):
+
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+
+## Overall Summary of OAI-PMH Stuff
+
+Big picture is that the majority of `no-pdf-link` crawl status are because of
+repository scope, record scope, or content format issues. That being said,
+there was a sizable fraction of sites which were platforms (like DSpace) which
+were not ingesting well.
+
+A significant fraction of records are "metadata only" (of papers), or non-paper
+entity types (like persons, grants, or journal titles), and a growing fraction
+(?) are metadata plus link to OA publisher fulltext (offsite). Might be
+possible to detect these at ingest time, or earlier at OAI-PMH
+harvest/transform time and filter them out.
+
+It may be worthwhile to attempt ingest of multiple existing captures
+(timestamps) in the ingest pipeline. Eg, isntead of chosing a single "best"
+capture, if therea are multiple HTTP 200 status captures, try ingest with each
+(or at least a couple). This is because repository software gets upgraded, so
+old "no-capture" or "not found" or "link loop" type captures may work when
+recrawled.
+
+New summary with additional filters:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -----------------------+----------
+ success | 12872279
+ no-pdf-link | 9329602
+ no-capture | 4696362
+ redirect-loop | 1541458
+ terminal-bad-status | 660418
+ link-loop | 452831
+ wrong-mimetype | 434868
+ null-body | 71065
+ cdx-error | 17005
+ | 15275
+ petabox-error | 12743
+ wayback-error | 11759
+ skip-url-blocklist | 182
+ gateway-timeout | 122
+ redirects-exceeded | 120
+ bad-redirect | 117
+ bad-gzip-encoding | 111
+ wayback-content-error | 102
+ timeout | 72
+ blocked-cookie | 62
+ (20 rows)
+