diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-09-03 18:34:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-09-03 18:34:33 -0700 |
commit | d749a7a6a1c1d439596c5d053daf904638b4dbc2 (patch) | |
tree | fd5ed9dd6dfd688908554ed84fb8caccf426e156 /notes/ingest | |
parent | c147ad35fa8ec59b8c015f8badae67c525f65253 (diff) | |
download | sandcrawler-d749a7a6a1c1d439596c5d053daf904638b4dbc2.tar.gz sandcrawler-d749a7a6a1c1d439596c5d053daf904638b4dbc2.zip |
OAI-PMH patch and ingest improvement notes
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2021-08_oai_pmh_patch.md | 204 | ||||
-rw-r--r-- | notes/ingest/2021-09-02_oai_pmh_patch.md | 1578 |
2 files changed, 1578 insertions, 204 deletions
diff --git a/notes/ingest/2021-08_oai_pmh_patch.md b/notes/ingest/2021-08_oai_pmh_patch.md deleted file mode 100644 index 20bb451..0000000 --- a/notes/ingest/2021-08_oai_pmh_patch.md +++ /dev/null @@ -1,204 +0,0 @@ - -Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially -re-crawling content which failed to ingest the first time. - -## Basic Counts - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'oai' - AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' - AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' - AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' - AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' - AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' - AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' - AND ingest_request.base_url NOT LIKE '%www.kb.dk%' - AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' - AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' - AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - -------------------------+---------- - success | 14143967 - no-pdf-link | 12857899 - no-capture | 5501279 - redirect-loop | 2092667 - terminal-bad-status | 747387 - wrong-mimetype | 597212 - link-loop | 542143 - null-body | 93566 - cdx-error | 20514 - petabox-error | 18387 - | 15283 - wayback-error | 13996 - gateway-timeout | 510 - skip-url-blocklist | 184 - wayback-content-error | 145 - bad-redirect | 137 - redirects-exceeded | 120 - bad-gzip-encoding | 116 - timeout | 80 - spn2-cdx-lookup-failure | 58 - (20 rows) - - - SELECT - oai_prefix, - COUNT(CASE WHEN status = 'success' THEN 1 END) as success, - COUNT(*) as total - FROM ( - SELECT - ingest_file_result.status as status, - -- eg "oai:cwi.nl:4881" - substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'oai' - AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' - AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' - AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' - AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' - AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' - AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' - AND ingest_request.base_url NOT LIKE '%www.kb.dk%' - AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' - AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' - AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' - ) t1 - GROUP BY oai_prefix - ORDER BY total DESC - LIMIT 25; - - oai_prefix | success | total - --------------------------+---------+--------- - repec | 1133019 | 2783448 - hal | 573019 | 1049607 - hsp.org | 0 | 810281 - www.irgrid.ac.cn | 18007 | 748828 - cds.cern.ch | 74078 | 688091 - americanae.aecid.es | 71309 | 572792 - juser.fz-juelich.de | 23026 | 518551 - espace.library.uq.edu.au | 6645 | 508960 - igi.indrastra.com | 59626 | 478577 - archive.ugent.be | 65269 | 424014 - hrcak.srce.hr | 403719 | 414897 - zir.nsk.hr | 156753 | 397200 - renati.sunedu.gob.pe | 79362 | 388355 - hypotheses.org | 3 | 374296 - rour.neicon.ru | 7997 | 354529 - generic.eprints.org | 263564 | 340470 - invenio.nusl.cz | 6340 | 325867 - evastar-karlsruhe.de | 62277 | 317952 - quod.lib.umich.edu | 5 | 309135 - diva.org | 67917 | 298348 - t2r2.star.titech.ac.jp | 1085 | 289388 - edpsciences.org | 139495 | 284972 - repository.ust.hk | 10243 | 283417 - revues.org | 151156 | 277497 - pure.atira.dk | 13492 | 260754 - (25 rows) - -Top counts by OAI prefix and status: - - SELECT - oai_prefix, - status, - COUNT((oai_prefix,status)) - FROM ( - SELECT - ingest_file_result.status as status, - -- eg "oai:cwi.nl:4881" - substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'oai' - AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' - AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' - AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' - AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' - AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' - AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' - AND ingest_request.base_url NOT LIKE '%www.kb.dk%' - AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' - AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' - AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' - ) t1 - GROUP BY oai_prefix, status - ORDER BY COUNT DESC - LIMIT 40; - - - oai_prefix | status | count - ---------------------------+---------------+--------- - repec | success | 1133019 - hsp.org | no-pdf-link | 794781 - repec | no-pdf-link | 638124 - hal | success | 573020 - cds.cern.ch | no-capture | 540380 - repec | redirect-loop | 516434 - juser.fz-juelich.de | no-pdf-link | 477881 - americanae.aecid.es | no-pdf-link | 417766 - hrcak.srce.hr | success | 403720 - www.irgrid.ac.cn | no-pdf-link | 370908 - hal | no-pdf-link | 359261 - www.irgrid.ac.cn | no-capture | 355532 - espace.library.uq.edu.au | no-pdf-link | 320479 - igi.indrastra.com | no-pdf-link | 318242 - repec | no-capture | 317062 - invenio.nusl.cz | no-pdf-link | 309802 - rour.neicon.ru | redirect-loop | 300911 - hypotheses.org | no-pdf-link | 300251 - renati.sunedu.gob.pe | no-capture | 282800 - t2r2.star.titech.ac.jp | no-pdf-link | 272045 - generic.eprints.org | success | 263564 - quod.lib.umich.edu | no-pdf-link | 259661 - archive.ugent.be | no-capture | 256164 - evastar-karlsruhe.de | no-pdf-link | 248939 - zir.nsk.hr | link-loop | 226919 - repository.ust.hk | no-pdf-link | 208569 - edoc.mpg.de | no-pdf-link | 199758 - bibliotecadigital.jcyl.es | no-pdf-link | 188433 - orbi.ulg.ac.be | no-pdf-link | 172373 - diva.org | no-capture | 171115 - lup.lub.lu.se | no-pdf-link | 168652 - erudit.org | success | 168490 - ojs.pkp.sfu.ca | success | 168029 - lib.dr.iastate.edu | success | 158494 - zir.nsk.hr | success | 156753 - digital.kenyon.edu | success | 154900 - revues.org | success | 151156 - books.openedition.org | no-pdf-link | 149607 - freidok.uni-freiburg.de | no-pdf-link | 146837 - digitalcommons.unl.edu | success | 144025 - (40 rows) - -TODO: also exclude: - - oai:nsp.org: (philly historical society) - -TODO: more rows for success/total query (aka, increase LIMIT) - -TODO: wait until MAG crawl is complete to re-run ingest? otherwise many -no-capture may actually be (recently) captured. depends on size of MAG crawl I -guess. - -TODO: just delete the "excluded" rows? -TODO: do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works -TODO: do random sampling of 'no-pdf-link' URLs, see if newer sandcrawler works diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md new file mode 100644 index 0000000..fded7b3 --- /dev/null +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -0,0 +1,1578 @@ + +Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially +re-crawling content which failed to ingest the first time. + +May fold this in with more general patch crawling. + +## Basic Counts + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 14145387 + no-pdf-link | 12063022 + no-capture | 5485640 + redirect-loop | 2092705 + terminal-bad-status | 747372 + wrong-mimetype | 597219 + link-loop | 542144 + null-body | 93566 + cdx-error | 19798 + petabox-error | 17943 + | 15283 + wayback-error | 13897 + gateway-timeout | 511 + skip-url-blocklist | 184 + wayback-content-error | 146 + bad-redirect | 137 + redirects-exceeded | 120 + bad-gzip-encoding | 116 + timeout | 80 + blocked-cookie | 64 + (20 rows) + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 40; + + + oai_prefix | success | total + ---------------------------+---------+--------- + repec | 1133175 | 2783448 + hal | 573218 | 1049607 + www.irgrid.ac.cn | 18007 | 748828 + cds.cern.ch | 74078 | 688091 + americanae.aecid.es | 71310 | 572792 + juser.fz-juelich.de | 23026 | 518551 + espace.library.uq.edu.au | 6649 | 508960 + igi.indrastra.com | 59629 | 478577 + archive.ugent.be | 65306 | 424014 + hrcak.srce.hr | 404085 | 414897 + zir.nsk.hr | 156753 | 397200 + renati.sunedu.gob.pe | 79362 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7997 | 354529 + generic.eprints.org | 263566 | 340470 + invenio.nusl.cz | 6340 | 325867 + evastar-karlsruhe.de | 62282 | 317952 + quod.lib.umich.edu | 5 | 309135 + diva.org | 67917 | 298348 + t2r2.star.titech.ac.jp | 1085 | 289388 + edpsciences.org | 139495 | 284972 + repository.ust.hk | 10245 | 283417 + revues.org | 151156 | 277497 + pure.atira.dk | 13492 | 260754 + bibliotecadigital.jcyl.es | 50606 | 254134 + escholarship.org/ark | 140835 | 245203 + ojs.pkp.sfu.ca | 168029 | 229387 + lup.lub.lu.se | 49358 | 226602 + library.wur.nl | 15051 | 216738 + digitalrepository.unm.edu | 111704 | 211749 + infoscience.tind.io | 60166 | 207299 + edoc.mpg.de | 0 | 205252 + erudit.org | 168490 | 197803 + delibra.bg.polsl.pl | 38666 | 196652 + n/a | 0 | 193814 + aleph.bib-bvb.de | 4349 | 186666 + serval.unil.ch | 41643 | 186372 + orbi.ulg.ac.be | 2400 | 184551 + digitalcommons.unl.edu | 144025 | 184372 + bib-pubdb1.desy.de | 33525 | 182717 + (40 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 50; + + oai_prefix | status | count + ---------------------------+---------------+--------- + repec | success | 1133175 + repec | no-pdf-link | 638105 + hal | success | 573218 + cds.cern.ch | no-capture | 540380 + repec | redirect-loop | 516451 + juser.fz-juelich.de | no-pdf-link | 477881 + americanae.aecid.es | no-pdf-link | 417766 + hrcak.srce.hr | success | 404085 + www.irgrid.ac.cn | no-pdf-link | 370908 + hal | no-pdf-link | 359252 + www.irgrid.ac.cn | no-capture | 355532 + espace.library.uq.edu.au | no-pdf-link | 320479 + igi.indrastra.com | no-pdf-link | 318242 + repec | no-capture | 316981 + invenio.nusl.cz | no-pdf-link | 309802 + rour.neicon.ru | redirect-loop | 300911 + hypotheses.org | no-pdf-link | 300251 + renati.sunedu.gob.pe | no-capture | 282800 + t2r2.star.titech.ac.jp | no-pdf-link | 272045 + generic.eprints.org | success | 263566 + quod.lib.umich.edu | no-pdf-link | 259661 + archive.ugent.be | no-capture | 256127 + evastar-karlsruhe.de | no-pdf-link | 248939 + zir.nsk.hr | link-loop | 226919 + repository.ust.hk | no-pdf-link | 208569 + edoc.mpg.de | no-pdf-link | 199758 + bibliotecadigital.jcyl.es | no-pdf-link | 188433 + orbi.ulg.ac.be | no-pdf-link | 172373 + diva.org | no-capture | 171115 + lup.lub.lu.se | no-pdf-link | 168652 + erudit.org | success | 168490 + ojs.pkp.sfu.ca | success | 168029 + lib.dr.iastate.edu | success | 158494 + zir.nsk.hr | success | 156753 + digital.kenyon.edu | success | 154900 + revues.org | success | 151156 + books.openedition.org | no-pdf-link | 149607 + freidok.uni-freiburg.de | no-pdf-link | 146837 + digitalcommons.unl.edu | success | 144025 + escholarship.org/ark | success | 140835 + culeuclid | link-loop | 140291 + edpsciences.org | success | 139495 + serval.unil.ch | no-pdf-link | 138644 + bib-pubdb1.desy.de | no-pdf-link | 133815 + krm.or.kr | no-pdf-link | 132461 + pure.atira.dk | no-pdf-link | 132179 + oai-gms.dimdi.de | redirect-loop | 131409 + aleph.bib-bvb.de | no-capture | 128261 + library.wur.nl | no-pdf-link | 124718 + lirias2repo.kuleuven.be | no-capture | 123106 + (50 rows) + +Note: could just delete the "excluded" rows? and not harvest them in the +future, and filter them at ingest time (in transform script). + + + +## Investigate no-pdf-link sandcrawler improvements + +Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works: + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%' + ORDER BY random() + LIMIT 10; + +Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works): + + \x auto + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + ORDER BY random() + LIMIT 30; + +### repec (SKIP-PREFIX) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35 +base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html +terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647 +base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf +terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75 +base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec +terminal_url | https://www.jstor.org/stable/1884373 + +Huh! This is just a catalog of other domains. Should probably skip + +DONE: skip/filter repec + +### juser.fz-juelich.de (SCOPE) + +-[ RECORD 1 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:132217 +base_url | http://juser.fz-juelich.de/record/132217 +terminal_url | http://juser.fz-juelich.de/record/132217 + +Poster; no files. + +-[ RECORD 2 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:268598 +base_url | http://juser.fz-juelich.de/record/268598 +terminal_url | http://juser.fz-juelich.de/record/268598 + +Journal. + +-[ RECORD 3 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:126613 +base_url | http://juser.fz-juelich.de/record/126613 +terminal_url | http://juser.fz-juelich.de/record/126613 + +-[ RECORD 4 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:67362 +base_url | http://juser.fz-juelich.de/record/67362 +terminal_url | http://juser.fz-juelich.de/record/67362 +-[ RECORD 5 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:869189 +base_url | http://juser.fz-juelich.de/record/869189 +terminal_url | http://juser.fz-juelich.de/record/869189 +-[ RECORD 6 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:810746 +base_url | http://juser.fz-juelich.de/record/810746 +terminal_url | http://juser.fz-juelich.de/record/810746 +-[ RECORD 7 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:52897 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +-[ RECORD 8 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:114755 +base_url | http://juser.fz-juelich.de/record/114755 +terminal_url | http://juser.fz-juelich.de/record/114755 +-[ RECORD 9 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:58025 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 + +The search URLs seem redundant? Not going to try to handle those. + +"Powered by Invenio v1.1.7" + +All of these examples seem to be not papers. Maybe we can filter these better +at the harvest or transform stage? + +### americanae.aecid.es (MIXED) + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:502896 +base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai +terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai + +just a metadata record? links to redalyc + +METADATA-ONLY + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:534600 +base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:524567 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 + +NOT-FOUND (404) + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:378914 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 + +Some single-page image archival thing? bespoke, skipping. + +SKIP-BESPOKE + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:526142 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 + +NOT-FOUND (404) + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:373408 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 + +NOT-FOUND (404) + +### www.irgrid.ac.cn (SKIP-PREFIX) + +Chinese Academy of Sciences Institutional Repositories Grid + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1749980 +base_url | http://www.irgrid.ac.cn/handle/1471x/1749980 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980 + +Can't access + +FORBIDDEN + +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/857397 +base_url | http://www.irgrid.ac.cn/handle/1471x/857397 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397 + +Just linking to another IR; skip it. + +http://ir.ipe.ac.cn/handle/122111/10608 + +requires login + +DONE: '/password-login;jsessionid' as a loginwall URL pattern + http://ir.ipe.ac.cn/handle/122111/10608 + http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf + +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1060447 +base_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1671377 +base_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1178430 +base_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2488017 +base_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/977147 +base_url | http://www.irgrid.ac.cn/handle/1471x/977147 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2454503 +base_url | http://ir.nwipb.ac.cn/handle/363003/9957 +terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957 + +this domain is a disapointment :( + +should continue crawling, as the metadata is open and good. but won't get fulltext? + +### hal (FIXED-PARTIAL) + +-[ RECORD 1 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00744951v1 +base_url | https://hal.archives-ouvertes.fr/hal-00744951 +terminal_url | https://hal.archives-ouvertes.fr/hal-00744951 + +Off-site OA link. + +FIXED-HAL + +-[ RECORD 2 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-01065398v1 +base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf +terminal_url | https://hal.archives-ouvertes.fr/index/index +-[ RECORD 3 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:lirmm-00371599v1 +base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 +terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 + +To elsevier :( + +-[ RECORD 4 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00284780v1 +base_url | https://hal.archives-ouvertes.fr/hal-00284780 +terminal_url | https://hal.archives-ouvertes.fr/hal-00284780 + +METADATA-ONLY + +-[ RECORD 5 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00186151v1 +base_url | https://hal.archives-ouvertes.fr/hal-00186151 +terminal_url | https://hal.archives-ouvertes.fr/hal-00186151 + +METADATA-ONLY + +-[ RECORD 6 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00399754v1 +base_url | https://hal.archives-ouvertes.fr/hal-00399754 +terminal_url | https://hal.archives-ouvertes.fr/hal-00399754 + +METADATA-ONLY + + +### espace.library.uq.edu.au (SKIP) + +-[ RECORD 1 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:136497 +base_url | https://espace.library.uq.edu.au/view/UQ:136497 +terminal_url | https://espace.library.uq.edu.au/view/UQ:136497 +-[ RECORD 2 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:411389 +base_url | https://espace.library.uq.edu.au/view/UQ:411389 +terminal_url | https://espace.library.uq.edu.au/view/UQ:411389 +-[ RECORD 3 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:401773 +base_url | https://espace.library.uq.edu.au/view/UQ:401773 +terminal_url | https://espace.library.uq.edu.au/view/UQ:401773 +-[ RECORD 4 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:675334 +base_url | https://espace.library.uq.edu.au/view/UQ:675334 +terminal_url | https://espace.library.uq.edu.au/view/UQ:675334 +-[ RECORD 5 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:312311 +base_url | https://espace.library.uq.edu.au/view/UQ:312311 +terminal_url | https://espace.library.uq.edu.au/view/UQ:312311 +-[ RECORD 6 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:209401 +base_url | https://espace.library.uq.edu.au/view/UQ:209401 +terminal_url | https://espace.library.uq.edu.au/view/UQ:209401 +-[ RECORD 7 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:327188 +base_url | https://espace.library.uq.edu.au/view/UQ:327188 +terminal_url | https://espace.library.uq.edu.au/view/UQ:327188 + +Very javascript heavy (skeletal HTML). And just links to fulltext on publisher +sites. + +### igi.indrastra.com (METADATA-ONLY) + +-[ RECORD 1 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:267221 +base_url | http://igi.indrastra.com/items/show/267221 +terminal_url | http://igi.indrastra.com/items/show/267221 +-[ RECORD 2 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:181799 +base_url | http://igi.indrastra.com/items/show/181799 +terminal_url | http://igi.indrastra.com/items/show/181799 +-[ RECORD 3 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:125382 +base_url | http://igi.indrastra.com/items/show/125382 +terminal_url | http://igi.indrastra.com/items/show/125382 +-[ RECORD 4 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:47266 +base_url | http://igi.indrastra.com/items/show/47266 +terminal_url | http://igi.indrastra.com/items/show/47266 +-[ RECORD 5 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:12872 +base_url | http://igi.indrastra.com/items/show/12872 +terminal_url | http://igi.indrastra.com/items/show/12872 +-[ RECORD 6 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:231620 +base_url | http://igi.indrastra.com/items/show/231620 +terminal_url | http://igi.indrastra.com/items/show/231620 + +"Proudly powered by Omeka" + +### invenio.nusl.cz (METADATA-ONLY) + + oai_id | base_url | terminal_url +----------------------------+------------------------------------+-------------------------------------- + oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409 + oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783 + oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961 + oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | http://invenio.nusl.cz/record/318800 + oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695 + oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393 + oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987 + oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396 + oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512 + oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631 + +Metadata only (at least this set) + +### hypotheses.org + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:mittelalter/9529 +base_url | http://mittelalter.hypotheses.org/9529 +terminal_url | https://mittelalter.hypotheses.org/9529 +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/18638 +base_url | http://archivalia.hypotheses.org/18638 +terminal_url | https://archivalia.hypotheses.org/18638 +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/13614 +base_url | http://archivalia.hypotheses.org/13614 +terminal_url | https://archivalia.hypotheses.org/13614 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:teteschercheuses/2785 +base_url | http://teteschercheuses.hypotheses.org/2785 +terminal_url | https://teteschercheuses.hypotheses.org/2785 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:altervsego/608 +base_url | http://altervsego.hypotheses.org/608 +terminal_url | http://altervsego.hypotheses.org/608 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivewk1/21905 +base_url | http://archivewk1.hypotheses.org/21905 +terminal_url | https://archivewk1.hypotheses.org/21905 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:slkdiaspo/3321 +base_url | http://slkdiaspo.hypotheses.org/3321 +terminal_url | https://slkdiaspo.hypotheses.org/3321 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:diga/280 +base_url | http://diga.hypotheses.org/280 +terminal_url | https://diga.hypotheses.org/280 + +These are all a big mix... basically blogs. Should continue crawling, but expect no yield. + +### t2r2.star.titech.ac.jp (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00105099 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00101346 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50161100 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00232407 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50120040 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50321440 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50235666 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 + + +### quod.lib.umich.edu + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2 +base_url | http://name.umdl.umich.edu/acf2679.0015.003 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003 +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:b14970.0001.001 +base_url | http://name.umdl.umich.edu/B14970.0001.001 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001 +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3 +base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3 +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43 +base_url | http://name.umdl.umich.edu/acg2248.1-16.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9 +base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9 +base_url | http://name.umdl.umich.edu/acg1336.1-24.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006 +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a +base_url | http://name.umdl.umich.edu/africanamer.0002.32a +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a + +These are... issues of journals? Should continue to crawl, but not expect much. + +### evastar-karlsruhe.de (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:270011444 +base_url | https://publikationen.bibliothek.kit.edu/270011444 +terminal_url | https://publikationen.bibliothek.kit.edu/270011444 +-[ RECORD 2 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000050117 +base_url | https://publikationen.bibliothek.kit.edu/1000050117 +terminal_url | https://publikationen.bibliothek.kit.edu/1000050117 +-[ RECORD 3 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:362296 +base_url | https://publikationen.bibliothek.kit.edu/362296 +terminal_url | https://publikationen.bibliothek.kit.edu/362296 +-[ RECORD 4 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:23042000 +base_url | https://publikationen.bibliothek.kit.edu/23042000 +terminal_url | https://publikationen.bibliothek.kit.edu/23042000 +-[ RECORD 5 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000069945 +base_url | https://publikationen.bibliothek.kit.edu/1000069945 +terminal_url | https://publikationen.bibliothek.kit.edu/1000069945 + + +### repository.ust.hk + +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-67233 +base_url | http://repository.ust.hk/ir/Record/1783.1-67233 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233 +-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-63232 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017 +terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-2891 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103 +terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com +-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-56231 +base_url | http://repository.ust.hk/ir/Record/1783.1-56231 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231 + +[...] + +-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-24872 +base_url | http://repository.ust.hk/ir/Record/1783.1-24872 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872 +-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-3457 +base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +terminal_url | http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-73215 +base_url | http://repository.ust.hk/ir/Record/1783.1-73215 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215 + +DONE: gateway.isiknowledge.com is bogus/blocking? + + +### edoc.mpg.de (SKIP-DEPRECATED) + + oai_id | base_url | terminal_url +------------------------+---------------------------+--------------------------- + oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650 + oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195 + oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655 + oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179 + oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141 + oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412 + oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531 + oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047 + oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650 + oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852 + +This whole instance seems to have been replaced + +### bibliotecadigital.jcyl.es (SKIP-DIGITIZED) + +-[ RECORD 1 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000039962 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +-[ RECORD 2 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14075 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +-[ RECORD 3 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:4842 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +-[ RECORD 4 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14799 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +-[ RECORD 5 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:821 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 + +Digitized images as pages; too much to deal with for now. + +### orbi.ulg.ac.be + +-[ RECORD 1 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/128079 +base_url | https://orbi.uliege.be/handle/2268/128079 +terminal_url | https://orbi.uliege.be/handle/2268/128079 +-[ RECORD 2 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/67659 +base_url | https://orbi.uliege.be/handle/2268/67659 +terminal_url | https://orbi.uliege.be/handle/2268/67659 +-[ RECORD 3 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/35521 +base_url | https://orbi.uliege.be/handle/2268/35521 +terminal_url | https://orbi.uliege.be/handle/2268/35521 +-[ RECORD 4 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/107922 +base_url | https://orbi.uliege.be/handle/2268/107922 +terminal_url | https://orbi.uliege.be/handle/2268/107922 +-[ RECORD 5 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/215694 +base_url | https://orbi.uliege.be/handle/2268/215694 +terminal_url | https://orbi.uliege.be/handle/2268/215694 + +Described below. + +### library.wur.nl (FIXED-BESPOKE) + + oai_id | base_url | terminal_url + -----------------------------------+------------------------------------------------+------------------------------------------------ + oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 + oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 + oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 + oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 + oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 + oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 + oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 + oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 + oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 + oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 + (10 rows) + +Seems like a one-off site? But added a pattern. + +### pure.atira.dk + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38 +base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694 +terminal_url | https://www.tandfonline.com/action/cookieAbsent +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html + +Metadata only + +DONE: /cookieAbsent is cookie block + https://www.tandfonline.com/action/cookieAbsent + +### bib-pubdb1.desy.de (FIXED-INVENIO) + +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:96756 +base_url | http://bib-pubdb1.desy.de/record/96756 +terminal_url | http://bib-pubdb1.desy.de/record/96756 + +Metadata only. + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:416556 +base_url | http://bib-pubdb1.desy.de/record/416556 +terminal_url | http://bib-pubdb1.desy.de/record/416556 + +Fixed! + +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:414545 +base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:170169 +base_url | http://bib-pubdb1.desy.de/record/170169 +terminal_url | http://bib-pubdb1.desy.de/record/170169 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:191154 +base_url | http://bib-pubdb1.desy.de/record/191154 +terminal_url | http://bib-pubdb1.desy.de/record/191154 + +Metadata only + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:155092 +base_url | http://bib-pubdb1.desy.de/record/155092 +terminal_url | http://bib-pubdb1.desy.de/record/155092 + +Fixed! + +-[ RECORD 8 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:97158 +base_url | http://bib-pubdb1.desy.de/record/97158 +terminal_url | http://bib-pubdb1.desy.de/record/97158 + +Metadata only + +"Powered by Invenio v1.1.7" + +Can/should skip the "search" URLs + +### serval.unil.ch + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_60346fc75171 +base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_4db47fc4b593 +base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_57aac24fe115 +base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_deabae6baf6c +base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_a5ec0df1370f +base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_080300c2e23c +base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679 +-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_de777dd2b07f +base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F +-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_5e824e244c27 +base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27 + +Metadata only? See elsewhere. + +### Random Links + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dbc.wroc.pl:41031 +base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 +terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 + +This is some platform/package thing. PDF is in an iframe. Platform is "DLibra". +FIXED-DLIBRA + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/174291 +base_url | https://orbi.uliege.be/handle/2268/174291 +terminal_url | https://orbi.uliege.be/handle/2268/174291 + +DSpace platform. There are multiple files, and little to "select" on. + +https://orbi.uliege.be/handle/2268/174200 has only single PDF and easier to work with + +PARTIAL-DSPACE + +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.tue.nl:664163 +base_url | http://repository.tue.nl/664163 +terminal_url | http://repository.tue.nl/664163 + +Ah, this is the Pure platform from Elsevier. +Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance + +FIXED-PURE + + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:juser.fz-juelich.de:49579 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 + +(handled above) + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/97937 +base_url | https://orcid.org/0000-0002-2066-2082 +terminal_url | https://orcid.org/0000-0002-2066-2082 + +ORCID! Skip it. + +DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time. + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:edoc.mpg.de:360269 +base_url | http://edoc.mpg.de/360269 +terminal_url | http://edoc.mpg.de/360269 + +Seems like this whole repo has disapeared, or been replaced by... pure? maybe a different pure? + +DONE: edoc.mpg.de -> pure.mpg.de + +-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:books.openedition.org:msha/17716 +base_url | http://books.openedition.org/msha/17716 +terminal_url | https://books.openedition.org/msha/17716 + +Open edition is free to read HTML, but not PDF (or epub, etc). + +TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest) + +HTML-WORKED + +-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epub.oeaw.ac.at:0x003aba48 +base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf +terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf + +requires login + +FORBIDDEN + +-[ RECORD 9 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/88986 +base_url | https://orcid.org/0000-0002-4147-2560 +terminal_url | https://orcid.org/0000-0002-4147-2560 + +DONE: skip orcids + +-[ RECORD 10 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-28786 +base_url | http://repository.ust.hk/ir/Record/1783.1-28786 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786 + +Generator: VuFind 5.1.1 +just a metadata record + +METADATA-ONLY + +-[ RECORD 11 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:rcin.org.pl:50797 +base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 +terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 + +Seems like a software platform? not sure. + +METADATA-ONLY + +-[ RECORD 12 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dea.lib.unideb.hu:2437/69641 +base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 +terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 + +-[ RECORD 13 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871 +base_url | http://handle.unsw.edu.au/1959.4/64871 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L + +-[ RECORD 14 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:www.wbc.poznan.pl:225930 +base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 +terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 + +SOFT-404 + +-[ RECORD 15 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.erciyes.edu.tr:105 +base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105 +terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105 + +GONE (domain not registered) + +-[ RECORD 16 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:37500 +base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 +terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 + +Seems like a bespoke site + +SKIP-BESPOKE + +-[ RECORD 17 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50401364 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 + +METADATA-ONLY + +-[ RECORD 18 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epubs.cclrc.ac.uk:work/4714 +base_url | http://purl.org/net/epubs/work/4714 +terminal_url | https://epubs.stfc.ac.uk/work/4714 + +It's got a purl! haha. + +METADATA-ONLY + +------ + +Another batch! With some repeat domains removed. + +-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc +base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc +terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov + +SKIP + +-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:etd.adm.unipi.it:etd-05302014-183910 +base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/ + +Some software platform? Pretty basic/bespoke + +FIXED-PARTIAL + +-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000098246 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 + +SKIP (see elsewhere) + +-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:elektra.cdaea.es:documento.29259 +base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 +terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 + +Photo. + +SKIP-SCOPE + +-[ RECORD 9 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829 +base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L + +METADATA-ONLY + +-[ RECORD 12 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a +base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html +terminal_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html + +unsure + +-[ RECORD 16 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.wur.nl:wurpubs/369344 +base_url | https://library.wur.nl/WebQuery/wurpubs/369344 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344 + +this specific record not OA (but site is fine/fixed) + +-[ RECORD 17 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:escholarship.umassmed.edu:oapubs-2146 +base_url | https://escholarship.umassmed.edu/oapubs/1147 +terminal_url | http://escholarship.umassmed.edu/oapubs/1147/ + +just links to publisher (no content in repo) + +-[ RECORD 18 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010 +base_url | https://digitalcommons.usu.edu/wild_facpub/11 +terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/ + +also just links to publisher (no content in repo) + +-[ RECORD 25 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:igi.indrastra.com:306768 +base_url | http://igi.indrastra.com/items/show/306768 +terminal_url | http://igi.indrastra.com/items/show/306768 + +(see elsewhere) + +-[ RECORD 26 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:fau.digital.flvc.org:fau_9804 +base_url | http://purl.flvc.org/fcla/dt/12932 +terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804 + +Islandora. + +-[ RECORD 27 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.lu.lv:7/16019 +base_url | https://dspace.lu.lv/dspace/handle/7/16019 +terminal_url | https://dspace.lu.lv/dspace/handle/7/16019 + +LOGINWALL + +-[ RECORD 28 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:zir.nsk.hr:umas_218 +base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 +terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 + +REMOVED + + +-[ RECORD 29 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:36390 +base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 +terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 + +Book, with chapters, not an individual work. + +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:krm.or.kr:10056135m201r +base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y +terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135 + +research results repository; keep crawling + +SKIP-SCOPE + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:www.db-thueringen.de:dbt_mods_00005191 +base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 +terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 + +powered by "MyCoRe" + +FIXED-MYCORE + +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405 +base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 +terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 + +seems to be a general purpose regional library? not research-specific + +SKIP-UNSURE + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:etd.adm.unipi.it:etd-02272019-123644 +base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/ + +This specific URL is not available (FORBIDDEN) + +others have multiple files, not just a single PDF: +https://etd.adm.unipi.it/t/etd-09102013-124430/ + +SKIP-UNSURE + +-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:commons.ln.edu.hk:sw_master-5408 +base_url | https://commons.ln.edu.hk/sw_master/4408 +terminal_url | https://commons.ln.edu.hk/sw_master/4408/ + +worth crawling I guess + +METADATA-ONLY + +-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:mouseion.jax.org:ssbb1976-1224 +base_url | https://mouseion.jax.org/ssbb1976/225 +terminal_url | https://mouseion.jax.org/ssbb1976/225/ + +METADATA-ONLY + +-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aleph.bib-bvb.de:bvb01-016604343 +base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer +terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true + +SOFT-404 / FORBIDDEN (cookie timeout) + +-[ RECORD 14 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bivaldi.gva.es:11740 +base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 +terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 + + +-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:library.wur.nl:wurpubs/443282 +base_url | https://library.wur.nl/WebQuery/wurpubs/443282 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282 + +DIGIBIS platform (like some others) + +FIXED-PARTIAL + +-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:hal:in2p3-00414135v1 +base_url | http://hal.in2p3.fr/in2p3-00414135 +terminal_url | http://hal.in2p3.fr:80/in2p3-00414135 + +METADATA-ONLY + +-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aaltodoc.aalto.fi:123456789/13201 +base_url | https://aaltodoc.aalto.fi/handle/123456789/13201 +terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201 + +This specific record is not accessible. +Another: https://aaltodoc.aalto.fi/handle/123456789/38002 + +DSpace 5.4 + +Worked (from recent changes) + + +-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:sedici.unlp.edu.ar:10915/40144 +base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view +terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view + +This is a journal! Cool. Plone software platform. + +FIXED + +## Top no-capture Domains + +Top terminal no-capture domains: + + SELECT domain, COUNT(domain) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_file_result.status = 'no-capture' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 30; + + domain | count + -----------------------------------+------- + digitalrepository.unm.edu | 94087 + escholarship.org | 80632 + ir.opt.ac.cn | 70504 + idus.us.es | 67908 + www.cambridge.org | 56376 + www.ssoar.info | 52534 + rep.bntu.by | 52127 + scholarworks.umt.edu | 48546 + publikationen.ub.uni-frankfurt.de | 46987 + dk.um.si | 45753 + repositorio.uladech.edu.pe | 37028 + uu.diva-portal.org | 34929 + digitalcommons.law.byu.edu | 31732 + sedici.unlp.edu.ar | 31233 + elib.sfu-kras.ru | 29131 + jyx.jyu.fi | 28144 + www.repository.cam.ac.uk | 27728 + nagoya.repo.nii.ac.jp | 26673 + www.duo.uio.no | 25258 + www.persee.fr | 24968 + www2.senado.leg.br | 24426 + tesis.ucsm.edu.pe | 24049 + digitalcommons.unl.edu | 21974 + www.degruyter.com | 21940 + www.igi-global.com | 20736 + thekeep.eiu.edu | 20712 + docs.lib.purdue.edu | 20538 + repositorio.cepal.org | 20280 + elib.bsu.by | 19620 + minds.wisconsin.edu | 19473 + (30 rows) + +These all seem worth crawling. A couple publishers (cambridge.org), and +persee.fr will probably fail, but not too many URLs. + +## Summary of Filtered Prefixes and Domains (OAI-PMH) + +oai:kb.dk: + too large and generic +oai:bdr.oai.bsb-muenchen.de: + too large and generic +oai:hispana.mcu.es: + too large and generic +oai:bnf.fr: + too large and generic +oai:ukm.si: + too large and generic +oai:biodiversitylibrary.org: + redundant with other ingest and archive.org content +oai:hsp.org: + large; historical content only +oai:repec: + large; mostly (entirely?) links to publisher sites +oai:n/a: + meta? +oai:quod.lib.umich.edu: + entire issues? hard to crawl so skip for now +oai:hypotheses.org: + HTML, not PDF +oai:americanae.aecid.es: + large, complex. skip for now +oai:www.irgrid.ac.cn: + aggregator of other IRs +oai:espace.library.uq.edu.au: + large; metadata only; javascript heavy (poor heritrix crawling) +oai:edoc.mpg.de: + deprecated domain, with no redirects +oai:bibliotecadigital.jcyl.es: + digitized historical docs; hard to crawl, skip for now +oai:repository.erciyes.edu.tr: + gone (domain lapsed) +oai:krm.or.kr: + "research results repository" (metadata only) + +www.kb.dk + large, general purpose, scope +kb-images.kb.dk + deprecated +mdz-nbn-resolving.de + multiple prefixes end up here. historical docs, scope +aggr.ukm.um.si + large, out of scope +edoc.mpg.de + deprecated domain +doaj.org + index (metadata only) +orcid.org + out of scope +gateway.isiknowledge.com + clarivate login/payall (skipping in ingest) + +Needs filtering to a subset of records (by 'set' or other filtering?): + +oai:igi.indrastra.com: +oai:invenio.nusl.cz: +oai:t2r2.star.titech.ac.jp: +oai:evastar-karlsruhe.de: +oai:repository.ust.hk: +oai:serval.unil.ch: +oai:pure.atira.dk: + +FIlters in SQL syntax: + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + +and in some contexts (PDFs; switch to HTML): + + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + +## Overall Summary of OAI-PMH Stuff + +Big picture is that the majority of `no-pdf-link` crawl status are because of +repository scope, record scope, or content format issues. That being said, +there was a sizable fraction of sites which were platforms (like DSpace) which +were not ingesting well. + +A significant fraction of records are "metadata only" (of papers), or non-paper +entity types (like persons, grants, or journal titles), and a growing fraction +(?) are metadata plus link to OA publisher fulltext (offsite). Might be +possible to detect these at ingest time, or earlier at OAI-PMH +harvest/transform time and filter them out. + +It may be worthwhile to attempt ingest of multiple existing captures +(timestamps) in the ingest pipeline. Eg, isntead of chosing a single "best" +capture, if therea are multiple HTTP 200 status captures, try ingest with each +(or at least a couple). This is because repository software gets upgraded, so +old "no-capture" or "not found" or "link loop" type captures may work when +recrawled. + +New summary with additional filters: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 12872279 + no-pdf-link | 9329602 + no-capture | 4696362 + redirect-loop | 1541458 + terminal-bad-status | 660418 + link-loop | 452831 + wrong-mimetype | 434868 + null-body | 71065 + cdx-error | 17005 + | 15275 + petabox-error | 12743 + wayback-error | 11759 + skip-url-blocklist | 182 + gateway-timeout | 122 + redirects-exceeded | 120 + bad-redirect | 117 + bad-gzip-encoding | 111 + wayback-content-error | 102 + timeout | 72 + blocked-cookie | 62 + (20 rows) + |