diff options
Diffstat (limited to 'notes')
36 files changed, 10319 insertions, 0 deletions
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md new file mode 100644 index 0000000..5c727b1 --- /dev/null +++ b/notes/dryad_datasets.md @@ -0,0 +1,17 @@ + +api docs: https://datadryad.org/api/v2/docs + +current search queries return 38,000 hits (December 2020) + +example with multiple versions: + https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions + + +how to handle versions? DOI doesn't get incremented. + +on archive.org, could have separate item for each version, or sub-directories within item, one for each version + +in fatcat, could have a release for each version, but only one with +the DOI; or could have a separate fileset for each version diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md new file mode 100644 index 0000000..5223651 --- /dev/null +++ b/notes/examples/2021-11-12_broken_grobid_xml.md @@ -0,0 +1,83 @@ + +Find all the PDFs from web which resulted in `bad-grobid-xml` status code (among others): + + sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100; + + sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata + ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------ + d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"} + 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"} + 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | 
bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf + FIXED + f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"} + https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf + FIXED + c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"} + https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf + FIXED (and good) + 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"} + https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf + FIXED + metadata quality mixed, but complex document (?) 
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"} + https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23 + FIXED + 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"} + https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf + FIXED + 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"} + not found + acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"} + https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf + BROKEN: not well-formed (invalid token): line 60, column 45 + <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note> + 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"} + not found + c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"} + https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308 + BROKEN: not well-formed (invalid token): line 58, column 45 + <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, & Bian, 2020).</note> + 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): 
line 1824, column 45"} + not found + 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"} + not found + f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"} + https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649 + FIXED, good + f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf + FIXED + 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"} + https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf + FIXED + 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1 + FIXED + (21 rows) + +Some other errors from other queries: + + d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"} + https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf + FIXED: with 0.7.0+ + + 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"} + 
https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf + still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500 + BAD PDF ("no pages" in evince) + + d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"} + https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf + FIXED + + 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t + https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf + FIXED + +In summary, there are still a small number of `bad-grobid-xml` cases, and still +many "very large PDF" cases. But we should probably broadly retry everything, +especially the 503 errors (from when GROBID is simply down/unavailable). + +The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations, +which I have submitted a patch/PR for. 
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt new file mode 100644 index 0000000..3a04750 --- /dev/null +++ b/notes/examples/dataset_examples.txt @@ -0,0 +1,52 @@ + +### ArchiveOrg: CAT dataset + +<https://archive.org/details/CAT_DATASET> + +`release_36vy7s5gtba67fmyxlmijpsaui` + +### + +<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635> + +doi:10.1371/journal.pone.0120448 + +Single .rar file + +### Dataverse + +<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B> + +Single excel file + +### Dataverse + +<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + +doi:10.7910/DVN/CLSFKX + +Multiple files; multiple versions? + +API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + + .data.id + .data.latestVersion.datasetPersistentId + .data.latestVersion.versionNumber, .versionMinorNumber + .data.latestVersion.files[] + .dataFile + .contentType (mimetype) + .filename + .filesize (int, bytes) + .md5 + .persistentId + .description + .label (filename?) + .version + +Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> + +Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3) + +Dataverse refs: +- 'doi' and 'hdl' are the two persistentId styles +- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt new file mode 100644 index 0000000..540dc9f --- /dev/null +++ b/notes/examples/html_test_journals.txt @@ -0,0 +1,153 @@ + +Good examples of journals to run HTML fulltext extraction on. 
+ +## Live Web + +d-lib magazine + live web + no longer active + http://www.dlib.org/back.html + +NLM technical bulletin + https://www.nlm.nih.gov/pubs/techbull/back_issues.html + +Genders + https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html + +firstmondays + live web; now OJS + +outhistory.org + +http://journal.sjdm.org/ + +http://whoosh.org/ + + +## Vanished (but wayback coverage) + +ohmylittledata + issn:2551-1289 + vanished + blog format + http://web.archive.org/web/20180421061156/https://ohmylittledata.com/ + +exquisit corpse + https://web.archive.org/web/20080521052400/http://corpse.org:80/ + +Journal of Mundane Behavior + https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya + ISSN: 1529-3041 + + defunct since ~2010 + simple HTML articles + references + http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm + http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm + +War Crimes + + PDF articles (not HTML) + http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/ + + +## DOAJ Test Articles (HTML) + + zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt + => 2,184,954 + + cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25 + 254817 link.springer.com + 145159 www.scielo.br + 78044 journal.frontiersin.org + 77394 www.frontiersin.org + 40849 www.dovepress.com + 19024 dergipark.org.tr + 18758 periodicos.ufsc.br + 16346 www.revistas.usp.br + 15872 revistas.unal.edu.co + 15527 revistas.ucm.es + 13669 revistas.usal.es + 12640 dergipark.gov.tr + 12111 journals.rudn.ru + 11839 www.scielosp.org + 11277 www.karger.com + 10827 www.journals.vu.lt + 10318 + 9854 peerj.com + 9100 ojs.unud.ac.id + 8581 jurnal.ugm.ac.id + 8261 riviste.unimi.it + 8012 journals.uran.ua + 7454 revistas.pucp.edu.pe + 7264 journals.vgtu.lt + 7200 
publicaciones.banrepcultural.org + + cat html_fulltext_urls.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.filtered.txt + => 1,579,257 + + zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt + => 560k + + cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25 + 40849 www.dovepress.com + 10570 journals.rudn.ru + 10494 dergipark.org.tr + 10233 revistas.unal.edu.co + 9981 dergipark.gov.tr + 9428 revistas.usal.es + 8292 revistas.ucm.es + 7200 publicaciones.banrepcultural.org + 6953 revistas.pucp.edu.pe + 6000 www.scielosp.org + 5962 www.scielo.br + 5621 www.richtmann.org + 5123 scielo.sld.cu + 5067 ojs.unud.ac.id + 4838 periodicos.ufsc.br + 4736 revistasonlinepre.inap.es + 4486 journal.fi + 4221 www.seer.ufu.br + 3553 revistas.uam.es + 3492 revistas.pucsp.br + 3060 www.scielo.org.co + 2991 scielo.isciii.es + 2802 seer.ufrgs.br + 2692 revistas.unc.edu.ar + 2685 srl.si + + cat html_fulltext_urls.no_doi.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.no_doi.filtered.txt + => 518,608 + + zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20 + https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795 + https://journal.umy.ac.id/index.php/st/article/view/3297 + https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442 + http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf + http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440 + https://journal.fi/inf/article/view/59430 + http://journal.uii.ac.id/index.php/Eksakta/article/view/2429 + 
https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS + https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157 + http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce + http://revistas.pucp.edu.pe/index.php/themis/article/view/11862 + http://journal.bdfish.org/index.php/fisheries/article/view/91 + https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567 + https://www.lithosphere.ru/jour/article/view/779 + https://journals.hioa.no/index.php/seminar/article/view/2412 + http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197 + https://www.kmuj.kmu.edu.pk/article/view/15698 + http://forodeeducacion.com/ojs/index.php/fde/article/view/82 + https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941 + http://grbs.library.duke.edu/article/view/3361 + diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md new file mode 100644 index 0000000..b69132c --- /dev/null +++ b/notes/examples/random_datasets.md @@ -0,0 +1,19 @@ + +Possible external datasets to ingest (which are not entire platforms): + +- https://research.google/tools/datasets/ +- https://openslr.org/index.html +- https://www.kaggle.com/datasets?sort=votes&tasks=true +- https://archive.ics.uci.edu/ml/datasets.php + +Existing archive.org datasets to ingest: + +- https://archive.org/details/allthemusicllc-datasets + +Papers on archive.org to ingest: + +- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=> +- <https://archive.org/details/biorxiv> +- <https://archive.org/details/philosophicaltransactions?tab=collection> +- <https://archive.org/search.php?query=doi%3A%2A> +- <https://archive.org/details/folkscanomy_academic> diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md new file mode 100644 index 0000000..f9abe09 --- /dev/null +++ b/notes/ingest/2020-11-04_arxiv.md 
@@ -0,0 +1,12 @@ + +Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run +a crawl. + +Crawl is now done, so going to ingest, hoping to get the majority of the +millions of remaining arxiv.org PDFs. + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l + => 1,288,559 + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md new file mode 100644 index 0000000..5979753 --- /dev/null +++ b/notes/ingest/2020-12-08_patch_crawl_notes.md @@ -0,0 +1,111 @@ + +Notes here about re-ingesting or re-crawling large batches. Goal around end of +2020 is to generate a broad patch crawl of terminal no-capture attempts for all +major sources crawled thus far. Have already tried running this process for unpaywall. + +For each, want filtered ingest request JSON objects (filtering out platforms +that don't crawl well, and possibly things like figshare+zenodo), and a broader +seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a +heritrix crawl with new config, then re-ingest all the requests individually. + +Summary of what to do here: + + OA DOI: expecting some 2.4 million seeds + OAI-PMH: expecting some 5 million no-capture URLs, plus more from missing PDF URL not found + Unpaywall: another ~900k no-capture URLs (maybe filtered?) + +For all, re-attempt for these status codes: + + no-capture + cdx-error + wayback-error + petabox-error + gateway-timeout (?) + +And at least do bulk re-ingest for these, if updated before 2020-11-20 or so: + + no-pdf-link + +## OAI-PMH + +Need to re-ingest all of the (many!) no-capture and no-pdf-link + +TODO: repec-specific URL extraction? 
+ +Skip these OAI prefixes: + + kb.dk + bnf.fr + hispana.mcu.es + bdr.oai.bsb-muenchen.de + ukm.si + hsp.org + +Skip these domains: + + www.kb.dk (kb.dk) + kb-images.kb.dk (kb.dk) + mdz-nbn-resolving.de (TODO: what prefix?) + aggr.ukm.um.si (ukm.si) + +Check PDF link extraction for these prefixes, or skip them (TODO): + + repec (mixed success) + biodiversitylibrary.org + juser.fz-juelich.de + americanae.aecid.es + www.irgrid.ac.cn + hal + espace.library.uq.edu.au + igi.indrastra.com + invenio.nusl.cz + hypotheses.org + t2r2.star.titech.ac.jp + quod.lib.umich.edu + + domain: hemerotecadigital.bne.es + domain: bib-pubdb1.desy.de + domain: publikationen.bibliothek.kit.edu + domain: edoc.mpg.de + domain: bibliotecadigital.jcyl.es + domain: lup.lub.lu.se + domain: orbi.uliege.be + +TODO: +- consider deleting ingest requests from skipped prefixes (large database use) + + +## Unpaywall + +About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`. + +Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) < '2020-11-20' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE 
'%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json'; + => COPY 1309990 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json + => 1.31M 0:00:51 [25.6k/s] + + cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md new file mode 100644 index 0000000..d7643f4 --- /dev/null +++ b/notes/ingest/2021-04_unpaywall.md @@ -0,0 +1,368 @@ + +New snapshot released 2021-02-18, finally getting around to a crawl two months +later. + +Intend to do same style of crawl as in the past. One change is that +sandcrawler-db has moved to a focal VM. + + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json + => 30.0M 3:14:59 [2.57k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2021-01-01' + 
AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json'; + => COPY 3277484 + + # previous, 2020-10 run: COPY 4216339 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json + => 3.28M 0:01:42 [32.1k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 26385866 + no-pdf-link | 2132565 + no-capture | 2092111 + redirect-loop | 1732543 + terminal-bad-status | 1504555 + wayback-content-error | 357345 + wrong-mimetype | 126070 + link-loop | 76808 + cdx-error | 22756 + null-body | 22066 + wayback-error | 13768 + gateway-timeout | 3804 + petabox-error | 3608 + spn2-cdx-lookup-failure | 1225 + redirects-exceeded | 892 + invalid-host-resolution | 505 + bad-redirect | 151 + spn2-error | 108 + spn2-error:job-failed | 91 + bad-gzip-encoding | 27 + (20 rows) + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + 
AND date(ingest_request.created) > '2021-01-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 1348623 + no-capture | 1231582 + redirect-loop | 45622 + no-pdf-link | 37312 + terminal-bad-status | 24162 + wrong-mimetype | 6684 + link-loop | 5757 + null-body | 1288 + wayback-content-error | 1123 + cdx-error | 831 + petabox-error | 697 + wayback-error | 185 + invalid-host-resolution | 41 + gateway-timeout | 29 + blocked-cookie | 22 + bad-gzip-encoding | 20 + spn2-cdx-lookup-failure | 7 + bad-redirect | 4 + timeout | 3 + redirects-exceeded | 3 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json'; + => 2020-10: 2,936,404 + => 2021-04: 1,805,192 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json + => 1.81M 0:01:27 [20.6k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt + 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt + 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt + 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt + +## Post-Crawl Bulk Ingest + + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1,804,211 consumer group lag + +## Post-Ingest Stats + +Overall status (unpaywall, all time): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 27242251 + no-pdf-link | 2746237 + redirect-loop | 1821132 + terminal-bad-status | 1553441 + no-capture | 478559 + wayback-content-error | 357390 + wrong-mimetype | 127365 + link-loop | 79389 + cdx-error | 23170 + null-body | 23169 + wayback-error | 13704 + gateway-timeout | 3803 + petabox-error | 3642 + redirects-exceeded | 1427 + spn2-cdx-lookup-failure | 1214 + invalid-host-resolution | 505 + bad-redirect | 153 + spn2-error | 107 + spn2-error:job-failed | 91 + body-too-large | 84 + (20 rows) + +Ingest stats broken down by publication stage: + + SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + release_stage | status | count + ---------------+-------------------------------------+---------- + accepted | success | 1213335 + accepted | no-pdf-link | 29292 + accepted | redirect-loop | 12769 + accepted | terminal-bad-status | 11264 + accepted | no-capture | 10187 + accepted | cdx-error | 1015 + accepted | wayback-content-error | 757 + accepted | wrong-mimetype | 501 + 
accepted | link-loop | 407 + accepted | wayback-error | 207 + accepted | petabox-error | 189 + accepted | redirects-exceeded | 125 + accepted | null-body | 34 + accepted | spn2-cdx-lookup-failure | 5 + accepted | gateway-timeout | 4 + accepted | blocked-cookie | 2 + accepted | bad-redirect | 1 + accepted | body-too-large | 1 + published | success | 20196774 + published | no-pdf-link | 2647969 + published | redirect-loop | 1477558 + published | terminal-bad-status | 1320013 + published | wayback-content-error | 351931 + published | no-capture | 297603 + published | wrong-mimetype | 115440 + published | link-loop | 76431 + published | cdx-error | 18125 + published | null-body | 17559 + published | wayback-error | 10466 + published | petabox-error | 2684 + published | gateway-timeout | 1979 + published | redirects-exceeded | 947 + published | spn2-cdx-lookup-failure | 877 + published | invalid-host-resolution | 457 + published | bad-redirect | 120 + published | spn2-error:job-failed | 77 + published | spn2-error | 70 + published | body-too-large | 39 + published | bad-gzip-encoding | 24 + published | timeout | 24 + published | blocked-cookie | 23 + published | spn2-error:soft-time-limit-exceeded | 4 + published | | 2 + published | pending | 1 + published | spn2-error:pending | 1 + published | too-many-redirects | 1 + submitted | success | 5832117 + submitted | redirect-loop | 330785 + submitted | terminal-bad-status | 222152 + submitted | no-capture | 170766 + submitted | no-pdf-link | 68934 + submitted | wrong-mimetype | 11424 + submitted | null-body | 5576 + submitted | wayback-content-error | 4702 + submitted | cdx-error | 4030 + submitted | wayback-error | 3031 + submitted | link-loop | 2551 + submitted | gateway-timeout | 1820 + submitted | petabox-error | 769 + submitted | redirects-exceeded | 355 + submitted | spn2-cdx-lookup-failure | 332 + submitted | invalid-host-resolution | 48 + submitted | body-too-large | 44 + submitted | spn2-error | 37 + submitted | 
bad-redirect | 32 + submitted | spn2-error:job-failed | 14 + submitted | | 13 + submitted | spn2-error:soft-time-limit-exceeded | 5 + submitted | timeout | 4 + submitted | bad-gzip-encoding | 3 + submitted | skip-url-blocklist | 1 + | no-pdf-link | 42 + | success | 25 + | redirect-loop | 20 + | terminal-bad-status | 12 + | no-capture | 3 + (76 rows) + + +Only the recent updates: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 2192376 + no-capture | 152183 + no-pdf-link | 144174 + redirect-loop | 125988 + terminal-bad-status | 67307 + link-loop | 8292 + wrong-mimetype | 7942 + null-body | 2270 + cdx-error | 1223 + wayback-content-error | 1147 + petabox-error | 728 + wayback-error | 155 + body-too-large | 82 + invalid-host-resolution | 41 + gateway-timeout | 28 + blocked-cookie | 22 + bad-gzip-encoding | 20 + timeout | 7 + bad-redirect | 6 + redirects-exceeded | 4 + (20 rows) + +In total, this iteration of unpaywall ingest resulted in: + +- 2,703,999 raw ingest requests (new URLs total) +- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet +- 843,753 (31.2%) success from new heritrix crawling +- 2,192,376 (81.1%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md new file mode 100644 index 0000000..e8748fa --- /dev/null +++ b/notes/ingest/2021-05_daily_improvements.md @@ -0,0 +1,480 @@ + +Summary of top large broken domains (2021-04-21 "30 
day" snapshot): + +## acervus.unicamp.br + + domain | status | count +---------------------------------------+-------------------------+-------- + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + +select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5; + +http://acervus.unicamp.br/index.asp?codigo_sophia=963332 + +seems like many of these were captures with a blank page? or a redirect to +the homepage? + +http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html + +messy, going to move on. + + +## apex.ipk-gatersleben.de + +apex.ipk-gatersleben.de | | 1253 +apex.ipk-gatersleben.de | no-pdf-link | 1132 + +select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5; + +https://doi.org/10.25642/ipk/rescoll/4886 +https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331 + +seem to be datasets/species, not articles. + +prefix: 10.25642/ipk + +## crossref.org + + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + +https://doi.org/10.1515/9781501747045-013 +https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013 + +Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML. 
+ +## openedition + + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + +https://doi.org/10.4000/books.pul.34492 +https://books.openedition.org/pul/34492 + +these are not actually OA books (or at least, not all are) + +## chemrxiv.org (figshare) + + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + +https://doi.org/10.26434/chemrxiv.14411081 +https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081 + +these all seem to be *multi-file* entities, thus not good for single file ingest pipeline. + +## direct.mit.edu + + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + +https://doi.org/10.7551/mitpress/14056.003.0004 +https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext + +"not available" + +https://doi.org/10.7551/mitpress/12444.003.0004 + +"not available" + + +## dlc.library.columbia.edu + + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + +https://doi.org/10.7916/d8-506w-kk49 +https://dlc.library.columbia.edu/durst/cul:18931zcrk9 + +document repository. +this one goes to IA! actually many seem to. +added extractor, should re-ingest with: + + publisher:"Columbia University" doi_prefix:10.7916 !journal:* + +actually, that is like 600k+ results and many are not digitized, so perhaps not. + +## doi.ala.org.au + + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + +https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052 +https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052 + +this is a data repository, with filesets, not papers. datacite metadata is +incorrect. 
+ +## fldeploc.dep.state.fl.us + + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + + +https://doi.org/10.35256/ic29 +http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29 + +re-ingest with: + + # only ~800 works + doi_prefix:10.35256 publisher:Florida + +## geoscan.nrcan.gc.ca + + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + +https://doi.org/10.4095/295366 +https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366 + +this is a geographic repository, not papers. + +## kiss.kstudy.com + + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + +https://doi.org/10.22143/hss21.12.1.121 +http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523 + +Korean. seems to not actually be theses? can't download. + +## linkinghub.elsevier.com + + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + +skipping for now, looks like mostly 'forbidden'? + +## osf.io + +These are important! + + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + +https://doi.org/10.31219/osf.io/jux3w +https://accounts.osf.io/login?service=https://osf.io/jux3w/download + +many of these are 404s by browser as well. what does that mean? + +## peerj.com + + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + +https://doi.org/10.7287/peerj.11155v0.1/reviews/2 +https://peerj.com/articles/11155/reviews/ + +these are HTML reviews, not papers + +## preprints.jmir.org + + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + +https://doi.org/10.2196/preprints.22556 +https://preprints.jmir.org/preprint/22556 + +UGH, looks simple, but javascript. + +could try to re-write URL into S3 format? meh. + +## psyarxiv.com (OSF?) 
+ + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + +https://doi.org/10.31234/osf.io/5jaqg +https://psyarxiv.com/5jaqg/ + +Also infuriatingly Javascript, but can do URL hack. + +Should reingest, and potentially force-recrawl: + + # about 67k + publisher:"Center for Open Science" in_ia:false + +## publons.com + + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + +https://doi.org/10.1002/jmor.21338/v2/review1 +https://publons.com/publon/40260824/ + +These are just HTML reviews, not papers. + +## saemobilus.sae.org + + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + +https://doi.org/10.4271/as1426c +https://saemobilus.sae.org/content/as1426c + +These seem to be standards, and are not open access (paywall) + +## scholar.dkyobobook.co.kr + + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + +https://doi.org/10.22471/crisis.2021.6.1.18 +http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536 + +Korean. complex javascript, skipping. + +## unreserved.rba.gov.au + + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + +https://doi.org/10.47688/rba_archives_2006/04129 +https://unreserved.rba.gov.au/users/login + +Don't need to login when I tried in browser? document repo, not papers. 
+ +## wayf.switch.ch + + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + +https://doi.org/10.24451/arbor.11128 +https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022 + +Loginwall + +## www.bloomsburycollections.com + + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + +https://doi.org/10.5040/9781849664264.0008 +https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries + +These are primarily not OA/available. + +## www.emc2020.eu + + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + +https://doi.org/10.22443/rms.emc2020.146 +https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html + +These are just abstracts, not papers. + +## Emerald + + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + +https://doi.org/10.1108/ramj-11-2020-0065 +https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html + +Note that these URLs are already HTML fulltext, but the PDF is also available and easy. 
+ +re-ingest: + + # only ~3k or so missing + doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true + +## www.humankineticslibrary.com + + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + +https://doi.org/10.5040/9781718206625.ch-002 +https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2 + +paywall + +## www.inderscience.com + + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + +https://doi.org/10.1504/ijdmb.2020.10036342 +https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb + +paywall + +## www.ingentaconnect.com + + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + +https://doi.org/10.15258/sst.2021.49.1.07 +https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02 + +Annoying javascript, but easy to work around. + +re-ingest: + + # only a couple hundred; also re-ingest + doi_prefix:10.15258 in_ia:false year:>2018 + +## www.nomos-elibrary.de + + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + +https://doi.org/10.5771/9783748907084-439 +https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren + +Javascript obfuscated download button? + +## www.oecd-ilibrary.org + + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + +https://doi.org/10.1787/543e84ed-en +https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en + +Paywall. + +## www.osapublishing.org + + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + +https://doi.org/10.1364/boe.422199 +https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199 + +Some of these are "pre-registered" DOIs, not published yet. Many of the +remaining are actually HTML articles, and/or have some stuff in the +`citation_pdf_url`. A core problem is captchas. 
+ +Have started adding support to fatcat for HTML crawl type based on container. + +re-ingest: + + container_twtpsm6ytje3nhuqfu3pa7ca7u (optica) + container_cg4vcsfty5dfvgmat5wm62wgie (optics express) + +## www.oxfordscholarlyeditions.com + + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + +https://doi.org/10.1093/oseo/instance.00266789 +https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27 + +loginwall/paywall + +## www.schweizerbart.de + + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + +https://doi.org/10.1127/zfg/40/1996/461 +https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref + +paywall + +## www.sciencedirect.com + + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + +https://doi.org/10.1016/j.landurbplan.2021.104104 +https://www.sciencedirect.com/science/article/pii/S0169204621000670 + +Bunch of crazy new hacks, but seems to be working! + +re-ingest: + + # to start! about 50k + doi_prefix:10.1016 is_oa:true year:2021 + +## www.sciendo.com + + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + +https://doi.org/10.2478/awutm-2019-0012 +https://www.sciendo.com/article/10.2478/awutm-2019-0012 + +uses lots of javascript, hard to scrape. 
+ + +## Others (for reference) + + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 + +## Re-ingests + +All the above combined: + + container_twtpsm6ytje3nhuqfu3pa7ca7u (optica) + container_cg4vcsfty5dfvgmat5wm62wgie (optics express) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u + => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie + => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864}) + + # only ~800 works + doi_prefix:10.35256 publisher:Florida + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida" + => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843}) + + # only ~3k or so missing + doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald" + => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812}) + + + # only a couple hundred; also re-ingest + doi_prefix:10.15258 in_ia:false year:>2018 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org 
--allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018" + => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140}) + + # to start! about 50k + doi_prefix:10.1016 is_oa:true year:2020 + doi_prefix:10.1016 is_oa:true year:2021 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020" + => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021" + => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824}) + + pmcid:* year:2018 + pmcid:* year:2019 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018" + => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019" + => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658}) + diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md new file mode 100644 index 0000000..8b6ac09 --- /dev/null +++ b/notes/ingest/2021-07_unpaywall.md @@ -0,0 +1,320 @@ + +New snapshot released 2021-07-02. Should be "boring" ingest and crawl. 
+ + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json + => 32.2M 3:01:52 [2.95k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260}) + + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2021-01-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json'; + => COPY 3556146 + + # previous, 2020-10 run: COPY 4216339 + # previous, 2021-07 run: COPY 3277484 + +Oops, should have run instead, with the date filter: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json'; + +But didn't, so processed all instead. 
+ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json + => 3.56M 0:01:59 [29.8k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done, on 2021-07-13 + + +## Check Pre-Crawl Status + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 1831827 + success | 1343604 + redirect-loop | 103999 + terminal-bad-status | 19845 + no-pdf-link | 17448 + link-loop | 5027 + wrong-mimetype | 2270 + cdx-error | 523 + body-too-large | 321 + null-body | 298 + wayback-content-error | 242 + petabox-error | 155 + gateway-timeout | 138 + invalid-host-resolution | 120 + wayback-error | 109 + blocked-cookie | 9 + timeout | 7 + | 3 + bad-redirect | 3 + spn2-cdx-lookup-failure | 3 + (20 rows) + + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND date(ingest_request.created) > '2021-07-01' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR 
ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json'; + => COPY 1743186 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json + => 1.74M 0:01:33 [18.6k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r 
.result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt + 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt + 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt + 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt + 3287992 total + +Then run crawl (see `journal-crawls` git repo). + +## Post-Crawl Bulk Ingest + + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1.74M 0:01:59 [14.6k/s] + +## Post-Ingest Stats + +Only the recent updates: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 2690258 + redirect-loop | 227328 + no-capture | 157368 + terminal-bad-status | 118943 + no-pdf-link | 92698 + blocked-cookie | 19478 + link-loop | 9249 + wrong-mimetype | 4918 + cdx-error | 1786 + wayback-error | 1497 + null-body | 1302 + body-too-large | 433 + wayback-content-error | 245 + petabox-error | 171 + gateway-timeout | 138 + invalid-host-resolution | 120 + timeout | 12 + bad-redirect | 4 + | 3 + spn2-cdx-lookup-failure | 1 + (20 rows) + +Only the recent updates, by publication stage: + + SELECT 
ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + release_stage | status | count + ---------------+-------------------------+--------- + accepted | success | 103144 + accepted | no-pdf-link | 53981 + accepted | terminal-bad-status | 4102 + accepted | link-loop | 2799 + accepted | no-capture | 2315 + accepted | redirect-loop | 2171 + accepted | blocked-cookie | 234 + accepted | cdx-error | 140 + accepted | wayback-error | 101 + accepted | wrong-mimetype | 38 + accepted | null-body | 10 + accepted | petabox-error | 5 + accepted | wayback-content-error | 4 + accepted | gateway-timeout | 2 + accepted | body-too-large | 2 + published | success | 1919100 + published | no-capture | 130104 + published | redirect-loop | 127482 + published | terminal-bad-status | 43118 + published | no-pdf-link | 33505 + published | blocked-cookie | 19034 + published | link-loop | 6241 + published | wrong-mimetype | 4163 + published | null-body | 1195 + published | cdx-error | 1151 + published | wayback-error | 1105 + published | wayback-content-error | 197 + published | body-too-large | 195 + published | petabox-error | 118 + published | gateway-timeout | 35 + published | invalid-host-resolution | 13 + published | timeout | 8 + published | bad-redirect | 2 + published | spn2-cdx-lookup-failure | 1 + published | bad-gzip-encoding | 1 + submitted | success | 668014 + submitted | redirect-loop | 97675 + submitted | terminal-bad-status | 71723 + submitted | no-capture | 24949 + submitted | no-pdf-link | 5212 + submitted | wrong-mimetype | 717 + submitted | cdx-error | 495 + 
submitted | wayback-error | 291 + submitted | body-too-large | 236 + submitted | blocked-cookie | 210 + submitted | link-loop | 209 + submitted | invalid-host-resolution | 107 + submitted | gateway-timeout | 101 + submitted | null-body | 97 + submitted | petabox-error | 48 + submitted | wayback-content-error | 44 + submitted | timeout | 4 + submitted | | 3 + submitted | bad-redirect | 2 + submitted | remote-server-error | 1 + (55 rows) + +In total, this iteration of unpaywall ingest resulted in: + +- 3,325,954 raw ingest requests (new URLs total) +- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and attempted to crawl +- 1,346,654 (77% of crawled) success from new heritrix crawling +- 2,690,258 (80%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) + +## Live Ingest Follow-Up + +Will run SPN requests on the ~160k `no-capture` URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json'; + => COPY 157371 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json + => 157k 0:00:04 [31.6k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + => DONE diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md new file mode 100644 index 0000000..5f92196 --- /dev/null +++ b/notes/ingest/2021-08_mag.md @@ -0,0 +1,400 @@ + +Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest. +Also want to re-ingest some old/failed ingests, now that pipeline/code has +improved. + +Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs. + + +## Persist Ingest Requests + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000}) + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request - + => 22.5M 0:46:00 [8.16k/s] + => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585}) + +Roughly 8.6 million new URLs + +## Pre-Crawl Status Counts + +Status of combined old and new requests, with some large domains removed: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT 
LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + success | 26123975 + | 6664846 + no-pdf-link | 1859908 + redirect-loop | 1532405 + no-capture | 1199126 + link-loop | 1157010 + terminal-bad-status | 832362 + gateway-timeout | 202158 + spn2-cdx-lookup-failure | 81406 + wrong-mimetype | 69087 + invalid-host-resolution | 37262 + wayback-error | 21340 + petabox-error | 11237 + null-body | 9414 + wayback-content-error | 2199 + cdx-error | 1893 + spn2-error | 1741 + spn2-error:job-failed | 971 + blocked-cookie | 902 + spn2-error:invalid-url-syntax | 336 + (20 rows) + +And just the new URLs (note that domain filter shouldn't be required, but +keeping for consistency): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE 
'%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + | 6664780 + success | 1957844 + redirect-loop | 23357 + terminal-bad-status | 9385 + no-pdf-link | 8315 + no-capture | 6892 + link-loop | 4517 + wrong-mimetype | 3864 + cdx-error | 1749 + blocked-cookie | 842 + null-body | 747 + wayback-error | 688 + wayback-content-error | 570 + gateway-timeout | 367 + petabox-error | 340 + spn2-cdx-lookup-failure | 150 + read-timeout | 122 + not-found | 119 + invalid-host-resolution | 63 + spn2-error | 23 + (20 rows) + +## Dump Initial Bulk Ingest Requests + +Note that this is all-time, not just recent, and will re-process a lot of +"no-pdf-link": + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-pdf-link' + OR ingest_file_result.status = 'cdx-error' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + 
AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json'; + => COPY 8526647 + +Transform to ingest requests: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json + => 8.53M 0:03:40 + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + +Updated stats after running initial bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 5184994 + no-capture | 3284416 + redirect-loop | 98685 + terminal-bad-status | 28733 + link-loop | 28518 + 
blocked-cookie | 22338 + no-pdf-link | 19073 + wrong-mimetype | 9122 + null-body | 2793 + wayback-error | 2128 + wayback-content-error | 1233 + cdx-error | 1198 + petabox-error | 617 + gateway-timeout | 395 + not-found | 130 + read-timeout | 128 + | 111 + invalid-host-resolution | 63 + spn2-cdx-lookup-failure | 24 + spn2-error | 20 + (20 rows) + +## Generate Seedlist + +For crawling, do a similar (but not identical) dump: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json'; + => COPY 4599519 + +Prep ingest requests (for 
post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json + => 4.60M 0:02:55 [26.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + => DONE + + wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt + 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + +## Post-Crawl Bulk Re-Ingest + +Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by +hash, URL agnostic). + +Enqueue for bulk re-ingest: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => Thu 19 Aug 2021 09:10:59 PM UTC + + +## Post-Ingest Stats + +Just the new stuff (compare against above for delta): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 7748241 89.2% + no-capture | 429688 4.9% + redirect-loop | 172831 2.0% + terminal-bad-status | 94029 1.1% + no-pdf-link | 86437 1.0% + blocked-cookie | 67903 0.8% + link-loop | 50622 + wrong-mimetype | 21064 + null-body | 6650 + cdx-error | 3313 + wayback-error | 2630 + gateway-timeout | 399 + petabox-error | 268 + wayback-content-error | 170 + not-found | 130 + read-timeout | 128 + | 109 + invalid-host-resolution | 63 + bad-redirect | 39 + spn2-error | 20 + (20 rows) + +New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397 + +Overall success of new batch: 
7748241. / 8686315 = 89.2% + +And combined (old and new) status again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 31990062 + redirect-loop | 1704717 + no-capture | 1263462 + link-loop | 1218280 + blocked-cookie | 1213838 + no-pdf-link | 1096664 + terminal-bad-status | 960070 + gateway-timeout | 202190 + wrong-mimetype | 86557 + invalid-host-resolution | 37262 + null-body | 15443 + wayback-error | 12839 + cdx-error | 4047 + spn2-error | 1731 + spn2-error:job-failed | 962 + petabox-error | 463 + wayback-content-error | 379 + spn2-error:invalid-url-syntax | 336 + spn2-error:soft-time-limit-exceeded | 203 + | 175 + (20 rows) + +New success total: 31990062 - 26123975 = 5,866,087 + +A full 1,263,462 no-capture that could be attempted... though many of those may +be excluded for a specific reason. 
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md new file mode 100644 index 0000000..ac808dd --- /dev/null +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -0,0 +1,1578 @@ + +Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially +re-crawling content which failed to ingest the first time. + +May fold this in with more general patch crawling. + +## Basic Counts + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 14145387 + no-pdf-link | 12063022 + no-capture | 5485640 + redirect-loop | 2092705 + terminal-bad-status | 747372 + wrong-mimetype | 597219 + link-loop | 542144 + null-body | 93566 + cdx-error | 19798 + petabox-error | 17943 + | 15283 + wayback-error | 13897 + gateway-timeout | 511 + skip-url-blocklist | 184 + wayback-content-error | 146 + bad-redirect | 137 + redirects-exceeded | 120 + bad-gzip-encoding | 116 + timeout | 80 + 
blocked-cookie | 64 + (20 rows) + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 40; + + + oai_prefix | success | total + ---------------------------+---------+--------- + repec | 1133175 | 2783448 + hal | 573218 | 1049607 + www.irgrid.ac.cn | 18007 | 748828 + cds.cern.ch | 74078 | 688091 + americanae.aecid.es | 71310 | 572792 + juser.fz-juelich.de | 23026 | 518551 + espace.library.uq.edu.au | 6649 | 508960 + igi.indrastra.com | 59629 | 478577 + archive.ugent.be | 65306 | 424014 + hrcak.srce.hr | 404085 | 414897 + zir.nsk.hr | 156753 | 397200 + renati.sunedu.gob.pe | 79362 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7997 | 354529 + generic.eprints.org | 263566 | 340470 + invenio.nusl.cz | 6340 | 325867 + evastar-karlsruhe.de | 62282 | 
317952 + quod.lib.umich.edu | 5 | 309135 + diva.org | 67917 | 298348 + t2r2.star.titech.ac.jp | 1085 | 289388 + edpsciences.org | 139495 | 284972 + repository.ust.hk | 10245 | 283417 + revues.org | 151156 | 277497 + pure.atira.dk | 13492 | 260754 + bibliotecadigital.jcyl.es | 50606 | 254134 + escholarship.org/ark | 140835 | 245203 + ojs.pkp.sfu.ca | 168029 | 229387 + lup.lub.lu.se | 49358 | 226602 + library.wur.nl | 15051 | 216738 + digitalrepository.unm.edu | 111704 | 211749 + infoscience.tind.io | 60166 | 207299 + edoc.mpg.de | 0 | 205252 + erudit.org | 168490 | 197803 + delibra.bg.polsl.pl | 38666 | 196652 + n/a | 0 | 193814 + aleph.bib-bvb.de | 4349 | 186666 + serval.unil.ch | 41643 | 186372 + orbi.ulg.ac.be | 2400 | 184551 + digitalcommons.unl.edu | 144025 | 184372 + bib-pubdb1.desy.de | 33525 | 182717 + (40 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' 
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 50; + + oai_prefix | status | count + ---------------------------+---------------+--------- + repec | success | 1133175 + repec | no-pdf-link | 638105 + hal | success | 573218 + cds.cern.ch | no-capture | 540380 + repec | redirect-loop | 516451 + juser.fz-juelich.de | no-pdf-link | 477881 + americanae.aecid.es | no-pdf-link | 417766 + hrcak.srce.hr | success | 404085 + www.irgrid.ac.cn | no-pdf-link | 370908 + hal | no-pdf-link | 359252 + www.irgrid.ac.cn | no-capture | 355532 + espace.library.uq.edu.au | no-pdf-link | 320479 + igi.indrastra.com | no-pdf-link | 318242 + repec | no-capture | 316981 + invenio.nusl.cz | no-pdf-link | 309802 + rour.neicon.ru | redirect-loop | 300911 + hypotheses.org | no-pdf-link | 300251 + renati.sunedu.gob.pe | no-capture | 282800 + t2r2.star.titech.ac.jp | no-pdf-link | 272045 + generic.eprints.org | success | 263566 + quod.lib.umich.edu | no-pdf-link | 259661 + archive.ugent.be | no-capture | 256127 + evastar-karlsruhe.de | no-pdf-link | 248939 + zir.nsk.hr | link-loop | 226919 + repository.ust.hk | no-pdf-link | 208569 + edoc.mpg.de | no-pdf-link | 199758 + bibliotecadigital.jcyl.es | no-pdf-link | 188433 + orbi.ulg.ac.be | no-pdf-link | 172373 + diva.org | no-capture | 171115 + lup.lub.lu.se | no-pdf-link | 168652 + erudit.org | success | 168490 + ojs.pkp.sfu.ca | success | 168029 + lib.dr.iastate.edu | success | 158494 + zir.nsk.hr | success | 156753 + digital.kenyon.edu | success | 154900 + revues.org | success | 151156 + books.openedition.org | no-pdf-link | 149607 + freidok.uni-freiburg.de | no-pdf-link | 146837 + digitalcommons.unl.edu | success | 144025 + escholarship.org/ark | success | 140835 + culeuclid | link-loop | 140291 + edpsciences.org | success | 139495 + serval.unil.ch | no-pdf-link | 138644 + bib-pubdb1.desy.de | no-pdf-link | 133815 + krm.or.kr | no-pdf-link | 132461 + pure.atira.dk | 
no-pdf-link | 132179 + oai-gms.dimdi.de | redirect-loop | 131409 + aleph.bib-bvb.de | no-capture | 128261 + library.wur.nl | no-pdf-link | 124718 + lirias2repo.kuleuven.be | no-capture | 123106 + (50 rows) + +Note: could just delete the "excluded" rows? and not harvest them in the +future, and filter them at ingest time (in transform script). + + + +## Investigate no-pdf-link sandcrawler improvements + +Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works: + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%' + ORDER BY random() + LIMIT 10; + +Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works): + + \x auto + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + 
LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + ORDER BY random() + LIMIT 30; + +### repec (SKIP-PREFIX) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | 
oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35 +base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html +terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647 +base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf +terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published +-[ RECORD 6 
]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75 +base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec +terminal_url | https://www.jstor.org/stable/1884373 + +Huh! This is just a catalog of other domains. Should probably skip + +DONE: skip/filter repec + +### juser.fz-juelich.de (SCOPE) + +-[ RECORD 1 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:132217 +base_url | http://juser.fz-juelich.de/record/132217 +terminal_url | http://juser.fz-juelich.de/record/132217 + +Poster; no files. + +-[ RECORD 2 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:268598 +base_url | http://juser.fz-juelich.de/record/268598 +terminal_url | http://juser.fz-juelich.de/record/268598 + +Journal. + +-[ RECORD 3 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:126613 +base_url | http://juser.fz-juelich.de/record/126613 +terminal_url | http://juser.fz-juelich.de/record/126613 + +-[ RECORD 4 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:67362 +base_url | http://juser.fz-juelich.de/record/67362 +terminal_url | http://juser.fz-juelich.de/record/67362 +-[ RECORD 5 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:869189 +base_url | http://juser.fz-juelich.de/record/869189 +terminal_url | http://juser.fz-juelich.de/record/869189 +-[ RECORD 6 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:810746 +base_url | http://juser.fz-juelich.de/record/810746 +terminal_url | http://juser.fz-juelich.de/record/810746 +-[ RECORD 7 ]+------------------------------------------------------------ +oai_id | 
oai:juser.fz-juelich.de:52897 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +-[ RECORD 8 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:114755 +base_url | http://juser.fz-juelich.de/record/114755 +terminal_url | http://juser.fz-juelich.de/record/114755 +-[ RECORD 9 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:58025 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 + +The search URLs seem redundant? Not going to try to handle those. + +"Powered by Invenio v1.1.7" + +All of these examples seem to be not papers. Maybe we can filter these better +at the harvest or transform stage? + +### americanae.aecid.es (MIXED) + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:502896 +base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai +terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai + +just a metadata record? 
links to redalyc + +METADATA-ONLY + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:534600 +base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:524567 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 + +NOT-FOUND (404) + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:378914 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 + +Some single-page image archival thing? bespoke, skipping. 
+ +SKIP-BESPOKE + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:526142 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 + +NOT-FOUND (404) + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:373408 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 + +NOT-FOUND (404) + +### www.irgrid.ac.cn (SKIP-PREFIX) + +Chinese Academy of Sciences Institutional Repositories Grid + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1749980 +base_url | http://www.irgrid.ac.cn/handle/1471x/1749980 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980 + +Can't access + +FORBIDDEN + +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/857397 +base_url | http://www.irgrid.ac.cn/handle/1471x/857397 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397 + +Just linking to another IR; skip it. 
+ +http://ir.ipe.ac.cn/handle/122111/10608 + +requires login + +DONE: '/password-login;jsessionid' as a loginwall URL pattern + http://ir.ipe.ac.cn/handle/122111/10608 + http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf + +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1060447 +base_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1671377 +base_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1178430 +base_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2488017 +base_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/977147 +base_url | http://www.irgrid.ac.cn/handle/1471x/977147 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2454503 +base_url | http://ir.nwipb.ac.cn/handle/363003/9957 +terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957 + +this domain is a disappointment :( + +should continue crawling, as the metadata is open and good. but won't get fulltext? 
+ +### hal (FIXED-PARTIAL) + +-[ RECORD 1 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00744951v1 +base_url | https://hal.archives-ouvertes.fr/hal-00744951 +terminal_url | https://hal.archives-ouvertes.fr/hal-00744951 + +Off-site OA link. + +FIXED-HAL + +-[ RECORD 2 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-01065398v1 +base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf +terminal_url | https://hal.archives-ouvertes.fr/index/index +-[ RECORD 3 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:lirmm-00371599v1 +base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 +terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 + +To elsevier :( + +-[ RECORD 4 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00284780v1 +base_url | https://hal.archives-ouvertes.fr/hal-00284780 +terminal_url | https://hal.archives-ouvertes.fr/hal-00284780 + +METADATA-ONLY + +-[ RECORD 5 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00186151v1 +base_url | https://hal.archives-ouvertes.fr/hal-00186151 +terminal_url | https://hal.archives-ouvertes.fr/hal-00186151 + +METADATA-ONLY + +-[ RECORD 6 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00399754v1 +base_url | https://hal.archives-ouvertes.fr/hal-00399754 +terminal_url | https://hal.archives-ouvertes.fr/hal-00399754 + +METADATA-ONLY + + +### espace.library.uq.edu.au (SKIP) + +-[ RECORD 1 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:136497 +base_url | https://espace.library.uq.edu.au/view/UQ:136497 +terminal_url | https://espace.library.uq.edu.au/view/UQ:136497 +-[ RECORD 2 ]+------------------------------------------------ 
+oai_id | oai:espace.library.uq.edu.au:uq:411389 +base_url | https://espace.library.uq.edu.au/view/UQ:411389 +terminal_url | https://espace.library.uq.edu.au/view/UQ:411389 +-[ RECORD 3 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:401773 +base_url | https://espace.library.uq.edu.au/view/UQ:401773 +terminal_url | https://espace.library.uq.edu.au/view/UQ:401773 +-[ RECORD 4 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:675334 +base_url | https://espace.library.uq.edu.au/view/UQ:675334 +terminal_url | https://espace.library.uq.edu.au/view/UQ:675334 +-[ RECORD 5 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:312311 +base_url | https://espace.library.uq.edu.au/view/UQ:312311 +terminal_url | https://espace.library.uq.edu.au/view/UQ:312311 +-[ RECORD 6 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:209401 +base_url | https://espace.library.uq.edu.au/view/UQ:209401 +terminal_url | https://espace.library.uq.edu.au/view/UQ:209401 +-[ RECORD 7 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:327188 +base_url | https://espace.library.uq.edu.au/view/UQ:327188 +terminal_url | https://espace.library.uq.edu.au/view/UQ:327188 + +Very javascript heavy (skeletal HTML). And just links to fulltext on publisher +sites. 
+ +### igi.indrastra.com (METADATA-ONLY) + +-[ RECORD 1 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:267221 +base_url | http://igi.indrastra.com/items/show/267221 +terminal_url | http://igi.indrastra.com/items/show/267221 +-[ RECORD 2 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:181799 +base_url | http://igi.indrastra.com/items/show/181799 +terminal_url | http://igi.indrastra.com/items/show/181799 +-[ RECORD 3 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:125382 +base_url | http://igi.indrastra.com/items/show/125382 +terminal_url | http://igi.indrastra.com/items/show/125382 +-[ RECORD 4 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:47266 +base_url | http://igi.indrastra.com/items/show/47266 +terminal_url | http://igi.indrastra.com/items/show/47266 +-[ RECORD 5 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:12872 +base_url | http://igi.indrastra.com/items/show/12872 +terminal_url | http://igi.indrastra.com/items/show/12872 +-[ RECORD 6 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:231620 +base_url | http://igi.indrastra.com/items/show/231620 +terminal_url | http://igi.indrastra.com/items/show/231620 + +"Proudly powered by Omeka" + +### invenio.nusl.cz (METADATA-ONLY) + + oai_id | base_url | terminal_url +----------------------------+------------------------------------+-------------------------------------- + oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409 + oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783 + oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961 + oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | 
http://invenio.nusl.cz/record/318800 + oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695 + oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393 + oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987 + oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396 + oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512 + oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631 + +Metadata only (at least this set) + +### hypotheses.org + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:mittelalter/9529 +base_url | http://mittelalter.hypotheses.org/9529 +terminal_url | https://mittelalter.hypotheses.org/9529 +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/18638 +base_url | http://archivalia.hypotheses.org/18638 +terminal_url | https://archivalia.hypotheses.org/18638 +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/13614 +base_url | http://archivalia.hypotheses.org/13614 +terminal_url | https://archivalia.hypotheses.org/13614 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:teteschercheuses/2785 +base_url | http://teteschercheuses.hypotheses.org/2785 +terminal_url | https://teteschercheuses.hypotheses.org/2785 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:altervsego/608 +base_url | http://altervsego.hypotheses.org/608 +terminal_url | http://altervsego.hypotheses.org/608 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivewk1/21905 +base_url | http://archivewk1.hypotheses.org/21905 +terminal_url | 
https://archivewk1.hypotheses.org/21905 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:slkdiaspo/3321 +base_url | http://slkdiaspo.hypotheses.org/3321 +terminal_url | https://slkdiaspo.hypotheses.org/3321 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:diga/280 +base_url | http://diga.hypotheses.org/280 +terminal_url | https://diga.hypotheses.org/280 + +These are all a big mix... basically blogs. Should continue crawling, but expect no yield. + +### t2r2.star.titech.ac.jp (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00105099 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00101346 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50161100 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00232407 +base_url | 
http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50120040 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50321440 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50235666 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 + + +### quod.lib.umich.edu + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2 +base_url | http://name.umdl.umich.edu/acf2679.0015.003 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003 +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:b14970.0001.001 +base_url | http://name.umdl.umich.edu/B14970.0001.001 +terminal_url | 
https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001 +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3 +base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3 +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43 +base_url | http://name.umdl.umich.edu/acg2248.1-16.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9 +base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9 +base_url | http://name.umdl.umich.edu/acg1336.1-24.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006 +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a +base_url | http://name.umdl.umich.edu/africanamer.0002.32a +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a + +These are... issues of journals? Should continue to crawl, but not expect much. 
+ +### evastar-karlsruhe.de (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:270011444 +base_url | https://publikationen.bibliothek.kit.edu/270011444 +terminal_url | https://publikationen.bibliothek.kit.edu/270011444 +-[ RECORD 2 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000050117 +base_url | https://publikationen.bibliothek.kit.edu/1000050117 +terminal_url | https://publikationen.bibliothek.kit.edu/1000050117 +-[ RECORD 3 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:362296 +base_url | https://publikationen.bibliothek.kit.edu/362296 +terminal_url | https://publikationen.bibliothek.kit.edu/362296 +-[ RECORD 4 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:23042000 +base_url | https://publikationen.bibliothek.kit.edu/23042000 +terminal_url | https://publikationen.bibliothek.kit.edu/23042000 +-[ RECORD 5 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000069945 +base_url | https://publikationen.bibliothek.kit.edu/1000069945 +terminal_url | https://publikationen.bibliothek.kit.edu/1000069945 + + +### repository.ust.hk + +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-67233 +base_url | http://repository.ust.hk/ir/Record/1783.1-67233 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233 +-[ RECORD 2 
]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-63232 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017 +terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-2891 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103 +terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com 
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-56231 +base_url | http://repository.ust.hk/ir/Record/1783.1-56231 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231 + +[...] + +-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-24872 +base_url | http://repository.ust.hk/ir/Record/1783.1-24872 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872 +-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-3457 +base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +terminal_url | 
http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-73215 +base_url | http://repository.ust.hk/ir/Record/1783.1-73215 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215 + +DONE: gateway.isiknowledge.com is bogus/blocking? + + +### edoc.mpg.de (SKIP-DEPRECATED) + + oai_id | base_url | terminal_url +------------------------+---------------------------+--------------------------- + oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650 + oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195 + oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655 + oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179 + oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141 + oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412 + oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531 + oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047 + oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650 + oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852 + +This whole instance seems to have been 
replaced + +### bibliotecadigital.jcyl.es (SKIP-DIGITIZED) + +-[ RECORD 1 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000039962 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +-[ RECORD 2 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14075 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +-[ RECORD 3 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:4842 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +-[ RECORD 4 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14799 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +-[ RECORD 5 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:821 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 + +Digitized images as pages; too much to deal with for now. 
+ +### orbi.ulg.ac.be + +-[ RECORD 1 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/128079 +base_url | https://orbi.uliege.be/handle/2268/128079 +terminal_url | https://orbi.uliege.be/handle/2268/128079 +-[ RECORD 2 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/67659 +base_url | https://orbi.uliege.be/handle/2268/67659 +terminal_url | https://orbi.uliege.be/handle/2268/67659 +-[ RECORD 3 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/35521 +base_url | https://orbi.uliege.be/handle/2268/35521 +terminal_url | https://orbi.uliege.be/handle/2268/35521 +-[ RECORD 4 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/107922 +base_url | https://orbi.uliege.be/handle/2268/107922 +terminal_url | https://orbi.uliege.be/handle/2268/107922 +-[ RECORD 5 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/215694 +base_url | https://orbi.uliege.be/handle/2268/215694 +terminal_url | https://orbi.uliege.be/handle/2268/215694 + +Described below. 
+ +### library.wur.nl (FIXED-BESPOKE) + + oai_id | base_url | terminal_url + -----------------------------------+------------------------------------------------+------------------------------------------------ + oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 + oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 + oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 + oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 + oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 + oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 + oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 + oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 + oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 + oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 + (10 rows) + +Seems like a one-off site? But added a pattern. 
+ +### pure.atira.dk + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | 
oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38 +base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694 +terminal_url | https://www.tandfonline.com/action/cookieAbsent +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html + +Metadata only + +DONE: /cookieAbsent is cookie block + https://www.tandfonline.com/action/cookieAbsent + +### bib-pubdb1.desy.de (FIXED-INVENIO) + +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:96756 +base_url | http://bib-pubdb1.desy.de/record/96756 +terminal_url | http://bib-pubdb1.desy.de/record/96756 + +Metadata only. + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:416556 +base_url | http://bib-pubdb1.desy.de/record/416556 +terminal_url | http://bib-pubdb1.desy.de/record/416556 + +Fixed! 
+ +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:414545 +base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:170169 +base_url | http://bib-pubdb1.desy.de/record/170169 +terminal_url | http://bib-pubdb1.desy.de/record/170169 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:191154 +base_url | http://bib-pubdb1.desy.de/record/191154 +terminal_url | http://bib-pubdb1.desy.de/record/191154 + +Metadata only + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:155092 +base_url | http://bib-pubdb1.desy.de/record/155092 +terminal_url | http://bib-pubdb1.desy.de/record/155092 + +Fixed! 
+ +-[ RECORD 8 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:97158 +base_url | http://bib-pubdb1.desy.de/record/97158 +terminal_url | http://bib-pubdb1.desy.de/record/97158 + +Metadata only + +"Powered by Invenio v1.1.7" + +Can/should skip the "search" URLs + +### serval.unil.ch + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_60346fc75171 +base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_4db47fc4b593 +base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_57aac24fe115 +base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +-[ RECORD 4 
]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_deabae6baf6c +base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_a5ec0df1370f +base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_080300c2e23c +base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679 +-[ RECORD 7 
]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_de777dd2b07f +base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F +-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_5e824e244c27 +base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27 + +Metadata only? See elsewhere. + +### Random Links + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dbc.wroc.pl:41031 +base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 +terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 + +This is some platform/package thing. PDF is in an iframe. Platform is "DLibra". +FIXED-DLIBRA + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/174291 +base_url | https://orbi.uliege.be/handle/2268/174291 +terminal_url | https://orbi.uliege.be/handle/2268/174291 + +DSpace platform. There are multiple files, and little to "select" on. 
+ +https://orbi.uliege.be/handle/2268/174200 has only a single PDF and is easier to work with + +PARTIAL-DSPACE + +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.tue.nl:664163 +base_url | http://repository.tue.nl/664163 +terminal_url | http://repository.tue.nl/664163 + +Ah, this is the Pure platform from Elsevier. +Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance + +FIXED-PURE + + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:juser.fz-juelich.de:49579 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 + +(handled above) + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/97937 +base_url | https://orcid.org/0000-0002-2066-2082 +terminal_url | https://orcid.org/0000-0002-2066-2082 + +ORCID! Skip it. + +DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time. + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:edoc.mpg.de:360269 +base_url | http://edoc.mpg.de/360269 +terminal_url | http://edoc.mpg.de/360269 + +Seems like this whole repo has disappeared, or been replaced by... pure? maybe a different pure? 
+ +DONE: edoc.mpg.de -> pure.mpg.de + +-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:books.openedition.org:msha/17716 +base_url | http://books.openedition.org/msha/17716 +terminal_url | https://books.openedition.org/msha/17716 + +Open edition is free to read HTML, but not PDF (or epub, etc). + +TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest) + +HTML-WORKED + +-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epub.oeaw.ac.at:0x003aba48 +base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf +terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf + +requires login + +FORBIDDEN + +-[ RECORD 9 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/88986 +base_url | https://orcid.org/0000-0002-4147-2560 +terminal_url | https://orcid.org/0000-0002-4147-2560 + +DONE: skip orcids + +-[ RECORD 10 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-28786 +base_url | http://repository.ust.hk/ir/Record/1783.1-28786 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786 + +Generator: VuFind 5.1.1 +just a metadata record + +METADATA-ONLY + +-[ RECORD 11 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:rcin.org.pl:50797 +base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 +terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 + +Seems like a software platform? not sure. 
+ +METADATA-ONLY + +-[ RECORD 12 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dea.lib.unideb.hu:2437/69641 +base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 +terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 + +-[ RECORD 13 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871 +base_url | http://handle.unsw.edu.au/1959.4/64871 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L + +-[ RECORD 14 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:www.wbc.poznan.pl:225930 +base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 +terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 + +SOFT-404 + +-[ RECORD 15 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.erciyes.edu.tr:105 +base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105 +terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105 + +GONE (domain not registered) + +-[ RECORD 16 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:37500 +base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 +terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 + +Seems like a bespoke site + +SKIP-BESPOKE + +-[ RECORD 17 
]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50401364 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 + +METADATA-ONLY + +-[ RECORD 18 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epubs.cclrc.ac.uk:work/4714 +base_url | http://purl.org/net/epubs/work/4714 +terminal_url | https://epubs.stfc.ac.uk/work/4714 + +It's got a purl! haha. + +METADATA-ONLY + +------ + +Another batch! With some repeat domains removed. + +-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc +base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc +terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov + +SKIP + +-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:etd.adm.unipi.it:etd-05302014-183910 +base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/ + +Some software platform? 
Pretty basic/bespoke + +FIXED-PARTIAL + +-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000098246 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 + +SKIP (see elsewhere) + +-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:elektra.cdaea.es:documento.29259 +base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 +terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 + +Photo. + +SKIP-SCOPE + +-[ RECORD 9 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829 +base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L + +METADATA-ONLY + +-[ RECORD 12 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a +base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html +terminal_url | 
http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html + +unsure + +-[ RECORD 16 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.wur.nl:wurpubs/369344 +base_url | https://library.wur.nl/WebQuery/wurpubs/369344 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344 + +this specific record not OA (but site is fine/fixed) + +-[ RECORD 17 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:escholarship.umassmed.edu:oapubs-2146 +base_url | https://escholarship.umassmed.edu/oapubs/1147 +terminal_url | http://escholarship.umassmed.edu/oapubs/1147/ + +just links to publisher (no content in repo) + +-[ RECORD 18 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010 +base_url | https://digitalcommons.usu.edu/wild_facpub/11 +terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/ + +also just links to publisher (no content in repo) + +-[ RECORD 25 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:igi.indrastra.com:306768 +base_url | http://igi.indrastra.com/items/show/306768 +terminal_url | http://igi.indrastra.com/items/show/306768 + +(see elsewhere) + +-[ RECORD 26 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:fau.digital.flvc.org:fau_9804 +base_url | http://purl.flvc.org/fcla/dt/12932 
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804 + +Islandora. + +-[ RECORD 27 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.lu.lv:7/16019 +base_url | https://dspace.lu.lv/dspace/handle/7/16019 +terminal_url | https://dspace.lu.lv/dspace/handle/7/16019 + +LOGINWALL + +-[ RECORD 28 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:zir.nsk.hr:umas_218 +base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 +terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 + +REMOVED + + +-[ RECORD 29 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:36390 +base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 +terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 + +Book, with chapters, not an individual work. 
+ +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:krm.or.kr:10056135m201r +base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y +terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135 + +research results repository; keep crawling + +SKIP-SCOPE + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:www.db-thueringen.de:dbt_mods_00005191 +base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 +terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 + +powered by "MyCoRe" + +FIXED-MYCORE + +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405 +base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 +terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 + +seems to be a general purpose regional library? 
not research-specific + +SKIP-UNSURE + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:etd.adm.unipi.it:etd-02272019-123644 +base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/ + +This specific URL is not available (FORBIDDEN) + +others have multiple files, not just a single PDF: +https://etd.adm.unipi.it/t/etd-09102013-124430/ + +SKIP-UNSURE + +-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:commons.ln.edu.hk:sw_master-5408 +base_url | https://commons.ln.edu.hk/sw_master/4408 +terminal_url | https://commons.ln.edu.hk/sw_master/4408/ + +worth crawling I guess + +METADATA-ONLY + +-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:mouseion.jax.org:ssbb1976-1224 +base_url | https://mouseion.jax.org/ssbb1976/225 +terminal_url | https://mouseion.jax.org/ssbb1976/225/ + +METADATA-ONLY + +-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aleph.bib-bvb.de:bvb01-016604343 +base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer +terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true + +SOFT-404 / FORBIDDEN (cookie timeout) + +-[ RECORD 14 
]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bivaldi.gva.es:11740 +base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 +terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 + + +-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:library.wur.nl:wurpubs/443282 +base_url | https://library.wur.nl/WebQuery/wurpubs/443282 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282 + +DIGIBIS platform (like some others) + +FIXED-PARTIAL + +-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:hal:in2p3-00414135v1 +base_url | http://hal.in2p3.fr/in2p3-00414135 +terminal_url | http://hal.in2p3.fr:80/in2p3-00414135 + +METADATA-ONLY + +-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aaltodoc.aalto.fi:123456789/13201 +base_url | https://aaltodoc.aalto.fi/handle/123456789/13201 +terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201 + +This specific record is not accessible. 
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002 + +DSpace 5.4 + +Worked (from recent changes) + + +-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:sedici.unlp.edu.ar:10915/40144 +base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view +terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view + +This is a journal! Cool. Plone software platform. + +FIXED + +## Top no-capture Domains + +Top terminal no-capture domains: + + SELECT domain, COUNT(domain) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_file_result.status = 'no-capture' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 30; + + domain | count + -----------------------------------+------- + digitalrepository.unm.edu | 94087 + escholarship.org | 80632 + ir.opt.ac.cn | 70504 + idus.us.es | 67908 + www.cambridge.org | 56376 + www.ssoar.info | 52534 + rep.bntu.by | 52127 + scholarworks.umt.edu | 48546 + publikationen.ub.uni-frankfurt.de | 46987 + dk.um.si | 45753 + repositorio.uladech.edu.pe | 37028 + uu.diva-portal.org | 34929 + digitalcommons.law.byu.edu | 31732 + sedici.unlp.edu.ar | 31233 + elib.sfu-kras.ru | 29131 + jyx.jyu.fi | 28144 + www.repository.cam.ac.uk | 27728 + nagoya.repo.nii.ac.jp | 26673 + www.duo.uio.no | 25258 + www.persee.fr | 24968 + www2.senado.leg.br | 24426 + tesis.ucsm.edu.pe | 24049 + digitalcommons.unl.edu | 
21974 + www.degruyter.com | 21940 + www.igi-global.com | 20736 + thekeep.eiu.edu | 20712 + docs.lib.purdue.edu | 20538 + repositorio.cepal.org | 20280 + elib.bsu.by | 19620 + minds.wisconsin.edu | 19473 + (30 rows) + +These all seem worth crawling. A couple publishers (cambridge.org), and +persee.fr will probably fail, but not too many URLs. + +## Summary of Filtered Prefixes and Domains (OAI-PMH) + +oai:kb.dk: + too large and generic +oai:bdr.oai.bsb-muenchen.de: + too large and generic +oai:hispana.mcu.es: + too large and generic +oai:bnf.fr: + too large and generic +oai:ukm.si: + too large and generic +oai:biodiversitylibrary.org: + redundant with other ingest and archive.org content +oai:hsp.org: + large; historical content only +oai:repec: + large; mostly (entirely?) links to publisher sites +oai:n/a: + meta? +oai:quod.lib.umich.edu: + entire issues? hard to crawl so skip for now +oai:hypotheses.org: + HTML, not PDF +oai:americanae.aecid.es: + large, complex. skip for now +oai:www.irgrid.ac.cn: + aggregator of other IRs +oai:espace.library.uq.edu.au: + large; metadata only; javascript heavy (poor heritrix crawling) +oai:edoc.mpg.de: + deprecated domain, with no redirects +oai:bibliotecadigital.jcyl.es: + digitized historical docs; hard to crawl, skip for now +oai:repository.erciyes.edu.tr: + gone (domain lapsed) +oai:krm.or.kr: + "research results repository" (metadata only) + +www.kb.dk + large, general purpose, scope +kb-images.kb.dk + deprecated +mdz-nbn-resolving.de + multiple prefixes end up here. 
historical docs, scope +aggr.ukm.um.si + large, out of scope +edoc.mpg.de + deprecated domain +doaj.org + index (metadata only) +orcid.org + out of scope +gateway.isiknowledge.com + clarivate login/paywall (skipping in ingest) + +Needs filtering to a subset of records (by 'set' or other filtering?): + +oai:igi.indrastra.com: +oai:invenio.nusl.cz: +oai:t2r2.star.titech.ac.jp: +oai:evastar-karlsruhe.de: +oai:repository.ust.hk: +oai:serval.unil.ch: +oai:pure.atira.dk: + +Filters in SQL syntax: + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND 
ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + +and in some contexts (PDFs; switch to HTML): + + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + +## Overall Summary of OAI-PMH Stuff + +Big picture is that the majority of `no-pdf-link` crawl status are because of +repository scope, record scope, or content format issues. That being said, +there was a sizable fraction of sites which were platforms (like DSpace) which +were not ingesting well. + +A significant fraction of records are "metadata only" (of papers), or non-paper +entity types (like persons, grants, or journal titles), and a growing fraction +(?) are metadata plus link to OA publisher fulltext (offsite). Might be +possible to detect these at ingest time, or earlier at OAI-PMH +harvest/transform time and filter them out. + +It may be worthwhile to attempt ingest of multiple existing captures +(timestamps) in the ingest pipeline. E.g., instead of choosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each +(or at least a couple). This is because repository software gets upgraded, so +old "no-capture" or "not found" or "link loop" type captures may work when +recrawled. 
+ +New summary with additional filters: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 12872279 + no-pdf-link | 9329602 + no-capture | 4696362 + redirect-loop | 1541458 + terminal-bad-status | 660418 + link-loop | 452831 + wrong-mimetype | 434868 + null-body | 71065 + cdx-error | 17005 + | 15275 + petabox-error | 12743 + wayback-error | 11759 + skip-url-blocklist | 182 + gateway-timeout | 122 + redirects-exceeded | 120 + bad-redirect | 117 + bad-gzip-encoding | 111 + wayback-content-error | 102 + timeout | 72 + blocked-cookie | 62 + (20 rows) + diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md new file mode 100644 index 0000000..a0bb0c5 --- /dev/null +++ b/notes/ingest/2021-09-03_daily_improvements.md @@ -0,0 +1,1021 @@ + +Periodic check-in of daily crawling/ingest. + +Overall ingest status, past 30 days: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + + ingest_type | status | count + -------------+-------------------------+-------- + pdf | no-pdf-link | 158474 + pdf | spn2-cdx-lookup-failure | 135344 + pdf | success | 127938 + pdf | spn2-error | 65411 + pdf | gateway-timeout | 63112 + pdf | blocked-cookie | 26338 + pdf | terminal-bad-status | 24853 + pdf | link-loop | 15699 + pdf | spn2-error:job-failed | 13862 + pdf | redirect-loop | 11432 + pdf | cdx-error | 2376 + pdf | too-many-redirects | 2186 + pdf | wrong-mimetype | 2142 + pdf | 
forbidden | 1758 + pdf | spn2-error:no-status | 972 + pdf | not-found | 820 + pdf | bad-redirect | 536 + pdf | read-timeout | 392 + pdf | wayback-error | 251 + pdf | remote-server-error | 220 + (20 rows) + +Hrm, that is a healthy fraction of `no-pdf-link`. + +Broken domains, past 30 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 39678 + osf.io | gateway-timeout | 29809 + acervus.unicamp.br | no-pdf-link | 21978 + osf.io | terminal-bad-status | 18727 + zenodo.org | spn2-cdx-lookup-failure | 17008 + doi.org | spn2-cdx-lookup-failure | 15503 + www.degruyter.com | no-pdf-link | 15122 + ieeexplore.ieee.org | spn2-error:job-failed | 12921 + osf.io | spn2-cdx-lookup-failure | 11123 + www.tandfonline.com | blocked-cookie | 8096 + www.morressier.com | no-pdf-link | 4655 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580 + pubs.acs.org | blocked-cookie | 4415 + www.frontiersin.org | no-pdf-link | 4163 + www.degruyter.com | spn2-cdx-lookup-failure | 3788 + www.taylorfrancis.com | no-pdf-link | 3568 + www.sciencedirect.com | no-pdf-link | 3128 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116 + acervus.unicamp.br | spn2-cdx-lookup-failure | 2797 + www.mdpi.com | 
spn2-cdx-lookup-failure | 2719 + brill.com | link-loop | 2681 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657 + www.sciencedirect.com | spn2-cdx-lookup-failure | 2546 + apps.crossref.org | no-pdf-link | 2537 + onlinelibrary.wiley.com | blocked-cookie | 2528 + (25 rows) + +Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure: + + SELECT domain, status, count + FROM ( + SELECT domain, status, COUNT((domain, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_file_result.status != 'spn2-cdx-lookup-failure' + ) t1 + WHERE t1.domain != '' + GROUP BY CUBE (domain, status) + ) t2 + WHERE count > 200 + ORDER BY domain ASC , count DESC; + + + domain | status | count + -----------------------------------------------------------------+-----------------------+-------- + academic.oup.com | | 2405 + academic.oup.com | no-pdf-link | 1240 + academic.oup.com | link-loop | 1010 + acervus.unicamp.br | | 21980 + acervus.unicamp.br | no-pdf-link | 21978 ** + aclanthology.org | | 208 + acp.copernicus.org | | 365 + acp.copernicus.org | success | 356 + aip.scitation.org | | 1071 + aip.scitation.org | blocked-cookie | 843 + aip.scitation.org | redirect-loop | 227 + apps.crossref.org | | 2537 + apps.crossref.org | no-pdf-link | 2537 + arxiv.org | | 17817 + arxiv.org | success | 17370 + arxiv.org | terminal-bad-status | 320 + asmedigitalcollection.asme.org | | 401 + asmedigitalcollection.asme.org | link-loop | 364 + assets.researchsquare.com | | 3706 + 
assets.researchsquare.com | success | 3706 + avmj.journals.ekb.eg | | 605 + avmj.journals.ekb.eg | success | 595 + bfa.journals.ekb.eg | | 224 + bfa.journals.ekb.eg | success | 214 + biorxiv.org | redirect-loop | 895 + biorxiv.org | | 895 + birdsoftheworld.org | | 286 + birdsoftheworld.org | no-pdf-link | 285 + bmjopen.bmj.com | success | 232 + bmjopen.bmj.com | | 232 + books.openedition.org | | 396 + books.openedition.org | no-pdf-link | 396 + brill.com | | 4272 + brill.com | link-loop | 2681 + brill.com | no-pdf-link | 1410 + cas.columbia.edu | | 1038 + cas.columbia.edu | no-pdf-link | 1038 ** + cdr.lib.unc.edu | | 513 + cdr.lib.unc.edu | success | 469 + chemrxiv.org | | 278 + chemrxiv.org | success | 275 + classiques-garnier.com | | 531 + classiques-garnier.com | no-pdf-link | 487 * + content.iospress.com | | 275 + content.iospress.com | link-loop | 230 + cris.maastrichtuniversity.nl | | 318 + cris.maastrichtuniversity.nl | success | 284 + cyberleninka.ru | | 1165 + cyberleninka.ru | success | 1134 + deepblue.lib.umich.edu | | 289 + dergipark.org.tr | | 1185 + dergipark.org.tr | success | 774 + dergipark.org.tr | no-pdf-link | 320 + didaktorika.gr | | 688 + didaktorika.gr | redirect-loop | 688 + digi.ub.uni-heidelberg.de | | 292 + digi.ub.uni-heidelberg.de | no-pdf-link | 292 + direct.mit.edu | | 236 + direct.mit.edu | no-pdf-link | 207 * + dl.acm.org | | 2319 + dl.acm.org | blocked-cookie | 2230 + dmtcs.episciences.org | | 733 + dmtcs.episciences.org | success | 730 + doi.ala.org.au | no-pdf-link | 2373 ** + doi.ala.org.au | | 2373 + doi.org | | 732 + doi.org | terminal-bad-status | 673 + downloads.hindawi.com | success | 1452 + downloads.hindawi.com | | 1452 + drive.google.com | | 216 + drive.google.com | no-pdf-link | 211 + dtb.bmj.com | | 674 + dtb.bmj.com | link-loop | 669 + easy.dans.knaw.nl | no-pdf-link | 261 * + easy.dans.knaw.nl | | 261 + ebooks.marilia.unesp.br | | 688 + ebooks.marilia.unesp.br | no-pdf-link | 688 * + ehp.niehs.nih.gov | | 766 + 
ehp.niehs.nih.gov | blocked-cookie | 765 + ejournal.mandalanursa.org | | 307 + ejournal.mandalanursa.org | success | 305 + elib.spbstu.ru | | 264 + elib.spbstu.ru | redirect-loop | 257 + elibrary.ru | | 1367 + elibrary.ru | redirect-loop | 1169 + elibrary.vdi-verlag.de | | 1251 + elibrary.vdi-verlag.de | no-pdf-link | 646 + elibrary.vdi-verlag.de | link-loop | 537 + elifesciences.org | | 328 + elifesciences.org | success | 323 + figshare.com | | 803 + figshare.com | no-pdf-link | 714 * + files.osf.io | | 745 + files.osf.io | success | 614 + hammer.purdue.edu | | 244 + hammer.purdue.edu | no-pdf-link | 243 + heiup.uni-heidelberg.de | | 277 + heiup.uni-heidelberg.de | no-pdf-link | 268 + hkvalidate.perfdrive.com | no-pdf-link | 370 * + hkvalidate.perfdrive.com | | 370 + ieeexplore.ieee.org | | 16675 + ieeexplore.ieee.org | spn2-error:job-failed | 12927 + ieeexplore.ieee.org | success | 1952 + ieeexplore.ieee.org | too-many-redirects | 1193 + ieeexplore.ieee.org | no-pdf-link | 419 + jamanetwork.com | | 339 + jamanetwork.com | success | 216 + jmstt.ntou.edu.tw | | 244 + jmstt.ntou.edu.tw | success | 241 + journal.ipb.ac.id | | 229 + journal.ipb.ac.id | success | 206 + journal.nafe.org | | 221 + journals.aps.org | | 614 + journals.aps.org | gateway-timeout | 495 + journals.asm.org | | 463 + journals.asm.org | blocked-cookie | 435 + journals.flvc.org | | 230 + journals.lww.com | | 1300 + journals.lww.com | link-loop | 1284 + journals.openedition.org | | 543 + journals.openedition.org | success | 311 + journals.ub.uni-heidelberg.de | | 357 + journals.ub.uni-heidelberg.de | success | 311 + jov.arvojournals.org | | 431 + jov.arvojournals.org | no-pdf-link | 422 * + kiss.kstudy.com | | 303 + kiss.kstudy.com | no-pdf-link | 303 * + library.iated.org | | 364 + library.iated.org | redirect-loop | 264 + library.seg.org | blocked-cookie | 301 + library.seg.org | | 301 + link.aps.org | redirect-loop | 442 + link.aps.org | | 442 + linkinghub.elsevier.com | | 515 + 
linkinghub.elsevier.com | gateway-timeout | 392 + mc.sbm.org.br | | 224 + mc.sbm.org.br | success | 224 + mdpi-res.com | | 742 + mdpi-res.com | success | 742 + mdsoar.org | | 220 + mediarep.org | | 269 + mediarep.org | success | 264 + medrxiv.org | redirect-loop | 290 + medrxiv.org | | 290 + muse.jhu.edu | | 429 + muse.jhu.edu | terminal-bad-status | 391 + mvmj.journals.ekb.eg | | 306 + oapub.org | | 292 + oapub.org | success | 289 + onepetro.org | | 426 + onepetro.org | link-loop | 406 + onlinelibrary.wiley.com | | 2835 + onlinelibrary.wiley.com | blocked-cookie | 2531 + onlinelibrary.wiley.com | redirect-loop | 264 + open.library.ubc.ca | | 569 + open.library.ubc.ca | no-pdf-link | 425 * + opendata.uni-halle.de | | 407 + opendata.uni-halle.de | success | 263 + osf.io | | 49022 + osf.io | gateway-timeout | 29810 + osf.io | terminal-bad-status | 18731 + osf.io | spn2-error | 247 + osf.io | not-found | 205 + oxford.universitypressscholarship.com | | 392 + oxford.universitypressscholarship.com | link-loop | 233 + panor.ru | no-pdf-link | 433 * + panor.ru | | 433 + papers.ssrn.com | | 1630 + papers.ssrn.com | link-loop | 1598 + pdf.sciencedirectassets.com | | 3063 + pdf.sciencedirectassets.com | success | 3063 + peerj.com | | 464 + peerj.com | no-pdf-link | 303 * + periodicos.ufpe.br | | 245 + periodicos.ufpe.br | success | 232 + periodicos.unb.br | | 230 + periodicos.unb.br | success | 221 + preprints.jmir.org | | 548 + preprints.jmir.org | cdx-error | 499 + publications.rwth-aachen.de | | 213 + publikationen.bibliothek.kit.edu | | 346 + publikationen.bibliothek.kit.edu | success | 314 + publikationen.uni-tuebingen.de | | 623 + publikationen.uni-tuebingen.de | no-pdf-link | 522 * + publons.com | no-pdf-link | 934 * + publons.com | | 934 + pubs.acs.org | | 4507 + pubs.acs.org | blocked-cookie | 4406 + pubs.rsc.org | | 1638 + pubs.rsc.org | link-loop | 1054 + pubs.rsc.org | redirect-loop | 343 + pubs.rsc.org | success | 201 + repositorio.ufu.br | | 637 + 
repositorio.ufu.br | success | 607 + repository.dri.ie | | 1852 + repository.dri.ie | no-pdf-link | 1852 ** + repository.library.brown.edu | | 293 + repository.library.brown.edu | no-pdf-link | 291 * + res.mdpi.com | | 10367 + res.mdpi.com | success | 10360 + retrovirology.biomedcentral.com | | 230 + revistas.ufrj.br | | 284 + revistas.ufrj.br | success | 283 + revistas.uptc.edu.co | | 385 + revistas.uptc.edu.co | success | 344 + royalsocietypublishing.org | | 231 + rsdjournal.org | | 347 + rsdjournal.org | success | 343 + s3-ap-southeast-2.amazonaws.com | | 400 + s3-ap-southeast-2.amazonaws.com | success | 392 + s3-eu-west-1.amazonaws.com | | 2096 + s3-eu-west-1.amazonaws.com | success | 2091 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286 + s3.ca-central-1.amazonaws.com | | 202 + sage.figshare.com | | 242 + sage.figshare.com | no-pdf-link | 241 + sajeb.org | | 246 + sajeb.org | no-pdf-link | 243 + scholar.dkyobobook.co.kr | | 332 + scholar.dkyobobook.co.kr | no-pdf-link | 328 * + search.mandumah.com | | 735 + search.mandumah.com | redirect-loop | 726 + secure.jbs.elsevierhealth.com | | 1112 + secure.jbs.elsevierhealth.com | blocked-cookie | 1108 + stm.bookpi.org | no-pdf-link | 468 * + stm.bookpi.org | | 468 + storage.googleapis.com | | 1012 + storage.googleapis.com | success | 1012 + tandf.figshare.com | | 469 + tandf.figshare.com | no-pdf-link | 466 + teses.usp.br | | 739 + teses.usp.br | success | 730 + tidsskrift.dk | | 360 + tidsskrift.dk | success | 346 + tiedejaedistys.journal.fi | | 224 + tind-customer-agecon.s3.amazonaws.com | success | 332 + tind-customer-agecon.s3.amazonaws.com | | 332 + valep.vc.univie.ac.at | no-pdf-link | 280 + valep.vc.univie.ac.at | | 280 + watermark.silverchair.com | | 1729 + watermark.silverchair.com | success | 1719 + www.academia.edu | | 387 + www.academia.edu | no-pdf-link | 386 + www.ahajournals.org | | 430 + 
www.ahajournals.org | blocked-cookie | 413 + www.atenaeditora.com.br | | 572 + www.atenaeditora.com.br | terminal-bad-status | 513 + www.atlantis-press.com | success | 722 + www.atlantis-press.com | | 722 + www.aup-online.com | | 419 + www.aup-online.com | no-pdf-link | 419 * + www.beck-elibrary.de | | 269 + www.beck-elibrary.de | no-pdf-link | 268 * + www.biodiversitylibrary.org | no-pdf-link | 528 * + www.biodiversitylibrary.org | | 528 + www.bloomsburycollections.com | | 623 + www.bloomsburycollections.com | no-pdf-link | 605 * + www.cabi.org | | 2191 + www.cabi.org | no-pdf-link | 2186 * + www.cairn.info | | 1283 + www.cairn.info | no-pdf-link | 713 + www.cairn.info | link-loop | 345 + www.cambridge.org | | 4128 + www.cambridge.org | no-pdf-link | 1531 + www.cambridge.org | success | 1441 + www.cambridge.org | link-loop | 971 + www.cureus.com | no-pdf-link | 526 * + www.cureus.com | | 526 + www.dbpia.co.kr | | 637 + www.dbpia.co.kr | redirect-loop | 631 + www.deboni.he.com.br | | 382 + www.deboni.he.com.br | success | 381 + www.degruyter.com | | 17783 + www.degruyter.com | no-pdf-link | 15102 + www.degruyter.com | success | 2584 + www.dovepress.com | | 480 + www.dovepress.com | success | 472 + www.e-manuscripta.ch | | 1350 + www.e-manuscripta.ch | no-pdf-link | 1350 * + www.e-periodica.ch | | 1276 + www.e-periodica.ch | no-pdf-link | 1275 + www.e-rara.ch | | 202 + www.e-rara.ch | no-pdf-link | 202 + www.elgaronline.com | | 495 + www.elgaronline.com | link-loop | 290 + www.elibrary.ru | | 922 + www.elibrary.ru | no-pdf-link | 904 + www.emerald.com | | 2155 + www.emerald.com | no-pdf-link | 1936 * + www.emerald.com | success | 219 + www.eurekaselect.com | | 518 + www.eurekaselect.com | no-pdf-link | 516 * + www.frontiersin.org | | 4163 + www.frontiersin.org | no-pdf-link | 4162 ** + www.hanser-elibrary.com | | 444 + www.hanser-elibrary.com | blocked-cookie | 444 + www.hanspub.org | | 334 + www.hanspub.org | no-pdf-link | 314 + www.idunn.no | | 1736 + www.idunn.no 
| link-loop | 596 + www.idunn.no | success | 577 + www.idunn.no | no-pdf-link | 539 + www.igi-global.com | terminal-bad-status | 458 + www.igi-global.com | | 458 + www.ijcai.org | | 533 + www.ijcai.org | success | 532 + www.ijraset.com | success | 385 + www.ijraset.com | | 385 + www.inderscience.com | | 712 + www.inderscience.com | no-pdf-link | 605 * + www.ingentaconnect.com | | 456 + www.ingentaconnect.com | no-pdf-link | 413 * + www.internationaljournalssrg.org | | 305 + www.internationaljournalssrg.org | no-pdf-link | 305 * + www.isca-speech.org | | 2392 + www.isca-speech.org | no-pdf-link | 2391 ** + www.journals.uchicago.edu | | 228 + www.journals.uchicago.edu | blocked-cookie | 227 + www.jstage.jst.go.jp | | 1492 + www.jstage.jst.go.jp | success | 1185 + www.jstage.jst.go.jp | no-pdf-link | 289 + www.jstor.org | | 301 + www.jurology.com | | 887 + www.jurology.com | redirect-loop | 887 + www.karger.com | | 318 + www.liebertpub.com | | 507 + www.liebertpub.com | blocked-cookie | 496 + www.morressier.com | | 4781 + www.morressier.com | no-pdf-link | 4655 ** + www.ncl.ecu.edu | | 413 + www.ncl.ecu.edu | success | 413 + www.nomos-elibrary.de | | 526 + www.nomos-elibrary.de | no-pdf-link | 391 + www.oecd-ilibrary.org | no-pdf-link | 1170 ** + www.oecd-ilibrary.org | | 1170 + www.openagrar.de | no-pdf-link | 221 + www.openagrar.de | | 221 + www.osapublishing.org | | 900 + www.osapublishing.org | link-loop | 615 + www.osapublishing.org | no-pdf-link | 269 + www.osti.gov | | 630 + www.osti.gov | link-loop | 573 + www.oxfordlawtrove.com | no-pdf-link | 476 * + www.oxfordlawtrove.com | | 476 + www.pdcnet.org | | 298 + www.pdcnet.org | terminal-bad-status | 262 + www.pedocs.de | | 203 + www.pnas.org | | 222 + www.preprints.org | | 372 + www.preprints.org | success | 366 + www.repository.cam.ac.uk | | 801 + www.repository.cam.ac.uk | success | 359 + www.repository.cam.ac.uk | no-pdf-link | 239 + www.research-collection.ethz.ch | | 276 + www.research-collection.ethz.ch | 
terminal-bad-status | 274 + www.revistas.usp.br | | 207 + www.revistas.usp.br | success | 204 + www.rina.org.uk | no-pdf-link | 1009 ** + www.rina.org.uk | | 1009 + www.schweizerbart.de | no-pdf-link | 202 + www.schweizerbart.de | | 202 + www.scielo.br | | 544 + www.scielo.br | redirect-loop | 526 + www.sciencedirect.com | | 3901 + www.sciencedirect.com | no-pdf-link | 3127 ** + www.sciencedirect.com | link-loop | 701 + www.sciendo.com | | 384 + www.sciendo.com | success | 363 + www.sciengine.com | | 225 + www.scirp.org | | 209 + www.spandidos-publications.com | | 205 + www.tandfonline.com | | 8925 + www.tandfonline.com | blocked-cookie | 8099 + www.tandfonline.com | terminal-bad-status | 477 + www.tandfonline.com | redirect-loop | 322 + www.taylorfrancis.com | | 6119 + www.taylorfrancis.com | no-pdf-link | 3567 + www.taylorfrancis.com | link-loop | 2169 + www.taylorfrancis.com | terminal-bad-status | 353 + www.thieme-connect.de | | 1047 + www.thieme-connect.de | redirect-loop | 472 + www.thieme-connect.de | spn2-error:job-failed | 343 + www.tib.eu | | 206 + www.trp.org.in | | 311 + www.trp.org.in | success | 311 + www.un-ilibrary.org | no-pdf-link | 597 * + www.un-ilibrary.org | | 597 + www.vr-elibrary.de | | 775 + www.vr-elibrary.de | blocked-cookie | 774 + www.wjgnet.com | | 204 + www.wjgnet.com | no-pdf-link | 204 + www.worldscientific.com | | 974 + www.worldscientific.com | blocked-cookie | 971 + www.worldwidejournals.com | | 242 + www.worldwidejournals.com | no-pdf-link | 203 + www.wto-ilibrary.org | no-pdf-link | 295 + www.wto-ilibrary.org | | 295 + www.zora.uzh.ch | | 222 + zenodo.org | | 49460 + zenodo.org | no-pdf-link | 39721 + zenodo.org | success | 8954 + zenodo.org | wrong-mimetype | 562 + | | 445919 + | no-pdf-link | 168035 + | success | 140875 + | gateway-timeout | 31809 + | blocked-cookie | 26431 + | terminal-bad-status | 25625 + | link-loop | 19006 + | spn2-error:job-failed | 13962 + | redirect-loop | 12512 + | wrong-mimetype | 2302 + | spn2-error 
| 1689 + | too-many-redirects | 1203 + | bad-redirect | 732 + | cdx-error | 539 + | not-found | 420 + | spn2-error:no-status | 256 + (419 rows) + +Get random subsets by terminal domain: + + \x auto + SELECT + ingest_request.link_source_id AS link_source_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%' + ORDER BY random() + LIMIT 5; + +## acervus.unicamp.br + +Previously flagged as messy (2021-05_daily_improvements.md) + +## cas.columbia.edu + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-2ety-qm51 +base_url | https://doi.org/10.7916/d8-2ety-qm51 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-0zf6-d167 +base_url | https://doi.org/10.7916/d8-0zf6-d167 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-k6ha-sn43 +base_url | https://doi.org/10.7916/d8-k6ha-sn43 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ 
RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-bj6t-eb07 +base_url | https://doi.org/10.7916/d8-bj6t-eb07 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-xjac-j502 +base_url | https://doi.org/10.7916/d8-xjac-j502 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback + +these are not public (loginwalls) + +DONE: '/login?TARGET=' as a login wall pattern + +## doi.ala.org.au + +Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md) + +NOTE: look at ingesting datasets + +## www.isca-speech.org + +-[ RECORD 1 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2014-84 +base_url | https://doi.org/10.21437/interspeech.2014-84 +terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html +-[ RECORD 2 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2004-319 +base_url | https://doi.org/10.21437/interspeech.2004-319 +terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html +-[ RECORD 3 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2006-372 +base_url | https://doi.org/10.21437/interspeech.2006-372 +terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html +-[ RECORD 4 ]--+---------------------------------------------------------------------------------- +link_source_id | 
10.21437/interspeech.2015-588 +base_url | https://doi.org/10.21437/interspeech.2015-588 +terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html +-[ RECORD 5 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2006-468 +base_url | https://doi.org/10.21437/interspeech.2006-468 +terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html + +Bespoke site. Added rule to sandcrawler. + +NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?) + +## www.morressier.com + + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0002858v +base_url | https://doi.org/10.1115/1.0002858v +terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5 +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0003896v +base_url | https://doi.org/10.1115/1.0003896v +terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038 +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0004476v +base_url | https://doi.org/10.1115/1.0004476v +terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5 +-[ 
RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0001286v +base_url | https://doi.org/10.1115/1.0001286v +terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0000315v +base_url | https://doi.org/10.1115/1.0000315v +terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874 + +Many of these seem to be presentations, as both video and slides. PDFs seem broken though. + +NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data) + +## www.oecd-ilibrary.org + +Paywall (2021-05_daily_improvements.md) + +## www.rina.org.uk + +-[ RECORD 1 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.ws.2002.10 +base_url | https://doi.org/10.3940/rina.ws.2002.10 +terminal_url | https://www.rina.org.uk/showproducts.html?product=4116 +-[ RECORD 2 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.pass.2003.16 +base_url | https://doi.org/10.3940/rina.pass.2003.16 +terminal_url | https://www.rina.org.uk/showproducts.html?product=3566 +-[ RECORD 3 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.icsotin.2013.15 +base_url | https://doi.org/10.3940/rina.icsotin.2013.15 +terminal_url | https://www.rina.org.uk/showproducts.html?product=8017 +-[ RECORD 4 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.wfa.2010.23 +base_url | 
https://doi.org/10.3940/rina.wfa.2010.23 +terminal_url | https://www.rina.org.uk/showproducts.html?product=8177 +-[ RECORD 5 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.icsotin15.2015.01 +base_url | https://doi.org/10.3940/rina.icsotin15.2015.01 +terminal_url | https://www.rina.org.uk/showproducts.html?product=7883 + +Site is broken in some way + +## www.sciencedirect.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.jhlste.2021.100332 +base_url | https://doi.org/10.1016/j.jhlste.2021.100332 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332 +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.hazadv.2021.100006 +base_url | https://doi.org/10.1016/j.hazadv.2021.100006 +terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf +-[ RECORD 3 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/b978-0-12-822844-9.00009-8 +base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8 +terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098 +-[ RECORD 4 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.colcom.2021.100490 +base_url | https://doi.org/10.1016/j.colcom.2021.100490 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308 +-[ RECORD 5 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/b978-0-323-85245-6.00012-6 +base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6 +terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126 + +These `no-pdf-link` ones just seem to be non-OA content, which is expected for much of the +domain. + +## repository.dri.ie + + link_source_id | base_url | terminal_url +-----------------------+---------------------------------------+--------------------------------------------- + 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941 + 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f + 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102 + 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t + 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726 + +"Digital Repository of Ireland" + +Historical scanned content. Bespoke site. Fixed. 
+ +NOTE: recrawl/retry this domain + +## www.frontiersin.org + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/978-2-88971-147-5 +base_url | https://doi.org/10.3389/978-2-88971-147-5 +terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fnins.2021.722592 +base_url | https://doi.org/10.3389/fnins.2021.722592 +terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fcell.2021.683209 +base_url | https://doi.org/10.3389/fcell.2021.683209 +terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full +-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fmicb.2021.692474 +base_url | https://doi.org/10.3389/fmicb.2021.692474 +terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fneur.2021.676527 +base_url | https://doi.org/10.3389/fneur.2021.676527 +terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full + +All the `/research-topics/` URLs are out of scope. + +NOTE: recrawl missing frontiersin.org articles for PDFs +NOTE: recrawl missing frontiersin.org articles for XML (?) 
+ +------- + +## direct.mit.edu + +Previously "not available" (2021-05_daily_improvements.md) + +## figshare.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15052236.v6 +base_url | https://doi.org/10.6084/m9.figshare.15052236.v6 +terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6 +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.14907846.v5 +base_url | https://doi.org/10.6084/m9.figshare.14907846.v5 +terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5 +-[ RECORD 3 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15157614.v1 +base_url | https://doi.org/10.6084/m9.figshare.15157614.v1 +terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1 +-[ RECORD 4 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15172926.v1 +base_url | https://doi.org/10.6084/m9.figshare.15172926.v1 +terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1 +-[ RECORD 5 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.16532574.v1 +base_url | https://doi.org/10.6084/m9.figshare.16532574.v1 +terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1 + +NOTE: can determine the content type (software, book, preprint, media, etc) from the redirect URL, I guess. This is helpful for ingest! +Could also potentially correct fatcat release_type using this info. + +We seem to be getting the ones we can (eg, papers) just fine + +## hkvalidate.perfdrive.com + +We should be skipping/bailing on this domain, but are not for some reason. 
+ +-[ RECORD 1 ]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac05cc +base_url | https://doi.org/10.3847/1538-4357/ac05cc +terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 2 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac0429 +base_url | https://doi.org/10.3847/1538-4357/ac0429 +terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 3 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1149/1945-7111/ac1a85 +base_url | https://doi.org/10.1149/1945-7111/ac1a85 +terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 4 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.35848/1882-0786/ac1b0d +base_url | https://doi.org/10.35848/1882-0786/ac1b0d +terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 5 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac05ba +base_url | https://doi.org/10.3847/1538-4357/ac05ba +terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= + +Was failing to check against blocklist again at the end of attempts. + +Could retry all these to update status, but probably not worth it. 
+ +## jov.arvojournals.org + + link_source_id | base_url | terminal_url +-----------------------+---------------------------------------+------------------------------------------------------------- + 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021 + 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561 + 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057 + 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793 + 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441 + +These seem to just not be published/available yet. + +But they also use watermark.silverchair.com + +NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest +NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix + +## kiss.kstudy.com + +Previously unable to download (2021-05_daily_improvements.md) + +## open.library.ubc.ca + + link_source_id | base_url | terminal_url +--------------------+------------------------------------+---------------------------------------------------------------------------------- + 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664 + 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189 + 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487 + 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994 + 10.14288/1.0401312 | 
https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312 + +Historical newspapers, out of scope? + +Video content: +https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487 + +Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 + +NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 + + +## panor.ru + + link_source_id | base_url | terminal_url +-------------------------+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html + 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html + 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html + 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html + 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html + +"The full version of the article is available only to subscribers of the journal" + +Paywall + +## 
peerj.com + +Previously: this is HTML of reviews (2021-05_daily_improvements.md) + +NOTE: Should be HTML ingest, possibly special case scope + +## publons.com + +Previously: this is HTML (2021-05_daily_improvements.md) + +NOTE: Should be HTML ingest, possibly special case scope (length of works) + +## stm.bookpi.org + + link_source_id | base_url | terminal_url +-----------------------------+---------------------------------------------+---------------------------------------------------- + 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231 + 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096 + 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330 + 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810 + 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274 + +These are... just abstracts of articles within a book? Weird. Maybe sketchy? 
DOIs via Crossref + +## www.cabi.org + + link_source_id | base_url | terminal_url +--------------------------+------------------------------------------+---------------------------------------------------- + 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742 + 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471 + 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544 + 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117 + 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337 + +Reviews? but just abstracts? + +## www.cureus.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17547 +base_url | https://doi.org/10.7759/cureus.17547 +terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.16867 +base_url | https://doi.org/10.7759/cureus.16867 +terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer +-[ RECORD 3 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17425 +base_url | https://doi.org/10.7759/cureus.17425 +terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community +-[ RECORD 4 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17313 +base_url | https://doi.org/10.7759/cureus.17313 +terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic +-[ RECORD 5 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.16943 +base_url | https://doi.org/10.7759/cureus.16943 +terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed + +Ugh, stupid "email to get PDF". but ingest seems to work anyways? 
+ +NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar) + +## www.e-manuscripta.ch + + link_source_id | base_url | terminal_url +------------------------------+----------------------------------------------+------------------------------------------------------------------- + 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031 + 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064 + 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176 + 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200 + 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008 + +Historical docs, single pages, but do have full PDF downloads. 
+ +NOTE: re-ingest + +## www.inderscience.com + +Previously: paywall (2021-05_daily_improvements.md) + +## www.un-ilibrary.org + + link_source_id | base_url | terminal_url +----------------------------+--------------------------------------------+------------------------------------------------------------- + 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307 + 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011 + 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014 + 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020 + 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005 + +Books and chapters. Doesn't seem to have actual download ability? + +# Re-Ingest / Re-Crawl + +Using fatcat-ingest helper tool. 
+ +- www.isca-speech.org doi_prefix:10.21437 + doi:* doi_prefix:10.21437 in_ia:false + 9,233 + ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json + => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221}) +- repository.dri.ie doi_prefix:10.7486 + doi:* in_ia:false doi_prefix:10.7486 + 56,532 + ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json + => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532}) +- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link) + 25,598 + many are meeting abstracts + ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json + => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598}) +- www.cureus.com doi_prefix:10.7759 + 1,537 + ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json + => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535}) +- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta + 110,945 + TODO: all are marked 'unpublished', but that is actually probably right? +- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!) + doi:* in_ia:false doi_prefix:10.3389 + 212,370 + doi:10.3389/conf.* => most seem to be just abstracts? how many like this? + container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k) + fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz + => 191k + but many might be components? 
this is actually kind of a mess + fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz + => 19.2k + ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json + +# Remaining Tasks / Domains (TODO) + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md new file mode 100644 index 0000000..d36f427 --- /dev/null +++ b/notes/ingest/2021-09-03_patch_crawl.md @@ -0,0 +1,678 @@ + +Going to run a combined crawl for `no-capture`, `no-pdf-link` and similar URL +statuses. + +As a reminder, significant refactor of PDF URL extraction happened around +Oct/Nov 2020, so things not re-ingested since then should be retried. + +1. first bulk re-process `no-pdf-link` statuses from past OAI-PMH and OA DOI crawls +2. then heritrix crawl of old URLs from all sources (see status codes below) +3. 
bulk ingest specific sources and statuses (see below) + +Status codes to crawl, potentially split into separate batches: + + no-capture + IA errors + cdx-error + wayback-error + wayback-content-error + petabox-error + spn2-cdx-lookup-failure + gateway-timeout + +Then, bulk ingest from these sources matching the above patterns, in this order: + +- OA DOI (fatcat-ingest or fatcat-changelog source; will result in import) +- unpaywall (will result in import) +- OAI-PMH +- MAG + +Current combined domain skip list (SQL filter syntax), for which we don't want +to bother retrying: + + '%journals.sagepub.com%' + '%pubs.acs.org%' + '%ahajournals.org%' + '%www.journal.csj.jp%' + '%aip.scitation.org%' + '%academic.oup.com%' + '%tandfonline.com%' + '%://orcid.org/%' + '%://doaj.org/%' + '%://archive.org/%' + '%://web.archive.org/%' + '%://www.archive.org/%' + +## DOI Ingest Status (2021-09-08) + +Recently did some analysis of OAI-PMH overall status, so can re-do comparisons +there easily. What about overall DOI ingest? Would like counts so we can +compare before/after. 
+ + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + no-pdf-link | 10516478 + success | 5690862 + redirect-loop | 1827192 + no-capture | 1215179 + terminal-bad-status | 650104 + link-loop | 610251 + blocked-cookie | 353681 + gateway-timeout | 341319 + too-many-redirects | 307895 + forbidden | 306710 + spn2-cdx-lookup-failure | 282955 + not-found | 273667 + cdx-error | 269082 + skip-url-blocklist | 265689 + spn2-error | 87759 + wrong-mimetype | 68993 + spn2-error:too-many-redirects | 58064 + wayback-error | 54152 + spn2-wayback-error | 51752 + remote-server-error | 45683 + (20 rows) + +## `no-pdf-link` re-try bulk ingest + +Specifically for past OAI-PMH and OA DOI crawls. + +What are top terminal domains that would be retried? So that we can filter out +large ones we don't want to bother retrying. 
+ + SELECT domain, COUNT(domain) + FROM ( + SELECT + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND 
ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + ) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 40; + + domain | count + ---------------------------------------+-------- + ssl.fao.org | 862277 + www.e-periodica.ch | 828110 + zenodo.org | 686701 + plutof.ut.ee | 685440 + www.gbif.org | 669727 + dlc.library.columbia.edu | 536018 + figshare.com | 383181 + juser.fz-juelich.de | 351519 + statisticaldatasets.data-planet.com | 320415 + espace.library.uq.edu.au | 310767 + invenio.nusl.cz | 309731 + doi.pangaea.de | 306311 + igi.indrastra.com | 297872 + bib-pubdb1.desy.de | 273565 + t2r2.star.titech.ac.jp | 271907 + digi.ub.uni-heidelberg.de | 265519 + www.sciencedirect.com | 263847 + publikationen.bibliothek.kit.edu | 229960 + www.plate-archive.org | 209231 + www.degruyter.com | 189776 + spectradspace.lib.imperial.ac.uk:8443 | 187086 + hal.archives-ouvertes.fr | 185513 + open.library.ubc.ca | 172821 + lup.lub.lu.se | 170063 + books.openedition.org | 169501 + orbi.uliege.be | 161443 + freidok.uni-freiburg.de | 150310 + library.wur.nl | 124318 + digital.library.pitt.edu | 116406 + www.research.manchester.ac.uk | 115869 + www.bibliotecavirtualdeandalucia.es | 114527 + repository.tue.nl | 112157 + www.google.com | 111569 + easy.dans.knaw.nl | 109608 + springernature.figshare.com | 108597 + nbn-resolving.org | 107544 + scholarbank.nus.edu.sg | 107299 + bibliotecavirtualdefensa.es | 105501 + biblio.ugent.be | 100854 + ruj.uj.edu.pl | 99500 + (40 rows) + +For a number of these domains, we do not expect any PDFs to be found, but are +going to re-ingest anyways so they get marked as 'blocked-*' in result table: + +- 
ssl.fao.org +- plutof.ut.ee +- www.gbif.org + +But some we are just going to skip anyways, because there *could* be PDFs, but +probably *aren't*: + +- zenodo.org +- t2r2.star.titech.ac.jp +- www.google.com +- figshare.com +- springernature.figshare.com + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json'; + => COPY 18040676 + +Transform and start ingest: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json | pv -l | shuf 
> /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json + => 18.0M 0:06:45 [44.5k/s] + + cat /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + => DONE + +## Progress Check + +OAI-PMH query: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + 
AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 13258356 + no-pdf-link | 8685519 + no-capture | 4765663 + redirect-loop | 1557731 + terminal-bad-status | 803373 + link-loop | 453999 + wrong-mimetype | 440230 + null-body | 71457 + cdx-error | 18426 + | 15275 + petabox-error | 13408 + wayback-error | 11845 + blocked-cookie | 11580 + skip-url-blocklist | 7761 + wayback-content-error | 383 + spn2-cdx-lookup-failure | 362 + gateway-timeout | 320 + body-too-large | 207 + spn2-error:job-failed | 191 + redirects-exceeded | 120 + (20 rows) + +OAI-PMH compared to a couple weeks ago: + + 13258356-12872279 = +386,077 success + 8685519-9329602 = -644,083 no-pdf-link + 4765663-4696362 = +69,301 no-capture + 803373-660418 = +142,955 terminal-bad-status + +OA DOI ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------+--------- + no-pdf-link | 6693547 + success | 5979016 + skip-url-blocklist | 3080986 + no-capture | 1876914 + redirect-loop | 1872817 + terminal-bad-status | 656674 + link-loop | 624290 + blocked-cookie | 448001 + gateway-timeout | 351896 + 
too-many-redirects | 307895 + forbidden | 306710 + spn2-cdx-lookup-failure | 301312 + cdx-error | 279766 + not-found | 273667 + wrong-mimetype | 83289 + spn2-error | 76806 + spn2-error:too-many-redirects | 58064 + wayback-error | 54278 + spn2-wayback-error | 51768 + remote-server-error | 45683 + (20 rows) + +OA DOI changes: + + 5979016-5690862 = +288,154 success + 6693547-10516478 = -3,822,931 no-pdf-link (still many!) + 1876914-1215179 = +661,735 no-capture + 3080986-265689 = +2,815,297 skip-url-blocklist + +Overall about half a million new 'success', pretty good. over 750k new +no-capture for crawling. + +## Seedlist Dumps + +Note that this is just seedlists, not full ingest requests. + + COPY ( + SELECT ingest_file_result.terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + ) TO '/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.txt'; + => 6,354,365 + +Then run the actual patch crawl! 
+ +## Ingest Requests for Bulk Retry (2022-01-06) + +Crawl has just about completed, so running another round of bulk ingest +requests, slightly updated to allow `https://doi.org/10*` in terminal URL: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.updated <= '2022-01-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + AND 
ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json'; + => 4,488,193 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => TIMEDOUT + => (probably due to re-assignment) + => DONE + +## Stats Again (just OAI-PMH) + +OAI-PMH query: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 
'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +On 2022-02-08: + + status | count + -----------------------+---------- + success | 13505143 + no-pdf-link | 8741007 + no-capture | 4429986 + redirect-loop | 1566611 + terminal-bad-status | 816162 + link-loop | 459006 + wrong-mimetype | 448983 + null-body | 71871 + cdx-error | 19055 + | 15275 + petabox-error | 11713 + blocked-cookie | 11664 + wayback-error | 8745 + skip-url-blocklist | 7828 + max-hops-exceeded | 2031 + wayback-content-error | 338 + body-too-large | 280 + spn2-error:job-failed | 191 + bad-redirect | 134 + redirects-exceeded | 120 + (20 rows) + + +On 2022-02-28, after bulk ingest completed: + + status | count + -----------------------+---------- + success | 14668123 + no-pdf-link | 8822460 + no-capture | 2987565 + redirect-loop | 1629015 + terminal-bad-status | 917851 + wrong-mimetype | 466512 + link-loop | 460941 + null-body | 71457 + cdx-error | 19636 + petabox-error | 16198 + | 15275 + blocked-cookie | 11885 + wayback-error | 8779 + skip-url-blocklist | 7838 + empty-blob | 5906 + max-hops-exceeded | 5563 + wayback-content-error | 355 + body-too-large | 329 + spn2-error:job-failed | 191 + bad-redirect | 137 + (20 rows) + + +Comparing to a couple months ago: + + 14668123-13258356 = +1,409,767 success + 
8822460-8685519 = + 136,941 no-pdf-link + 2987565-4765663 = -1,778,098 no-capture + 917851-803373 = + 114,478 terminal-bad-status + diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md new file mode 100644 index 0000000..786c3b2 --- /dev/null +++ b/notes/ingest/2021-12-13_datasets.md @@ -0,0 +1,504 @@ + +First round of production dataset ingest. Aiming to get one or two small +repositories entirely covered, and a few thousand datasets from all supported +platforms. + +Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up +to a TByte of content locally (on spinning disk). For successful output, will +run through fatcat import; for a subset of unsuccessful, will start a small +heritrix crawl. + + +## Ingest Generation + +Summary: + + wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json + 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +All the below ingest requests were combined into a single large file: + + cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz + # 24.7k 0:00:00 [91.9k/s] + +### Figshare + +- sample 10k datasets (not other types) +- want only "versioned" DOIs; use regex on DOI to ensure + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \ + | rg '10\.6084/m9\.figshare\.\d+.v\d+' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000}) + +### Zenodo + +- has DOIs (of course) +- want only "versioned" DOIs? how to skip? 
+- sample 10k + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \ + | rg '10\.5281/zenodo' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +### Goettingen Research Online + +- <https://data.goettingen-research-online.de/> +- Dataverse instance, not Harvard-hosted +- ~1,400 datasets, ~10,500 files +- has DOIs +- `doi_prefix:10.25625`, then filter to only one slash + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \ + | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \ + | shuf \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s] + +### Harvard Dataverse + +- main Harvard Dataverse instance, many "sub-dataverses" +- ~137,000 datasets, ~1,400,000 files +- 10k sample + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \ + | rg '10\.7910/dvn/[a-z0-9]{6}' \ + | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s] + +Note that this yielded fewer requests than expected (2.97k rather than the 10k sample), but moving on anyway. + +### archive.org + +A couple of hand-filtered items. 
+ +"CAT" dataset +- item: <https://archive.org/details/CAT_DATASET> +- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui` + +"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing" +- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62 +- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper) + + + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/CAT_DATASET", + "release_stage": "published", + "fatcat": { + "release_ident": "36vy7s5gtba67fmyxlmijpsaui", + "work_ident": "ycqtbhnfmzamheq2amztiwbsri" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "36vy7s5gtba67fmyxlmijpsaui" + } + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62", + "release_stage": "published", + "fatcat": { + "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu", + "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu" + } + + # paste and then Ctrl-D: + cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + + +## Ingest Command + +On `wbgrp-svc263`. + +In the current version of the tool, `skip_cleanup_local_files=True` by default, so +files will stick around. + +Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output. 
+ + + # first a small sample + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | head -n5 \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json + + # ok, run the whole batch through + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json + +Got an error: + + internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`? + +Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work: + + AttributeError: 'ArchiveSession' object has no attribute 'upload' + +Going to hack with config in homedir for now. + +Extract URLs for crawling: + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt + +### Exceptions Encountered + + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process + internetarchive.upload + [...] 
+ ConnectionResetError: [Errno 104] Connection reset by peer + urllib3.exceptions.ProtocolError + requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5') + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process + r.raise_for_status() + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status + raise HTTPError(http_error_msg, response=self) + requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201 + +Downloads sometimes just slowly time out, e.g. after a day or more. + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process + file_meta = gen_file_metadata_path(local_path, allow_empty=True) + File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path + mimetype = magic.Magic(mime=True).from_file(path) + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 
111, in from_file + with _real_open(filename): + FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz' + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process + dataset_meta = platform_helper.process_request(request, resource, html_biblio) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request + obj_latest = obj["data"]["latestVersion"] + KeyError: 'latestVersion' + +Fixed the above, trying again: + + git log | head -n1 + # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c + + Wed Dec 15 21:57:42 UTC 2021 + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json + +Zenodo seems really slow, let's try filtering those out: + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json + # 3.76k 15:12:53 [68.7m/s] + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json + +## Fatcat Import + + wc -l ingest_dataset_combined_results*.json + 126 ingest_dataset_combined_results2.json + 153 
ingest_dataset_combined_results3.json + 275 ingest_dataset_combined_results4.json + 3762 ingest_dataset_combined_results5.json + 7736 ingest_dataset_combined_results6.json + 182 ingest_dataset_combined_results.json + 5 ingest_dataset_combined_results.ramp.json + 12239 total + + cat ingest_dataset_combined_results*.json \ + | rg '^\{' \ + | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \ + | sort \ + | uniq --check-chars 26 \ + | cut -f2 \ + | rg -v '\\\\' \ + | pv -l \ + > uniq_ingest_dataset_combined_results.json + # 9.48k 0:00:06 [1.54k/s] + + cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr + 7941 no-capture + 374 platform-404 + 369 terminal-bad-status + 348 success-file + 172 success + 79 platform-scope + 77 error-platform-download + 47 empty-manifest + 27 platform-restricted + 20 too-many-files + 12 redirect-loop + 6 error-archiveorg-upload + 3 too-large-size + 3 mismatch + 1 no-platform-match + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success") | .' -c \ + > uniq_ingest_dataset_combined_results.success.json + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success-file") | .' -c \ + > uniq_ingest_dataset_combined_results.success-file.json + +On fatcat QA instance: + + git log | head -n1 + # commit cca680e2cc4768a4d45e199f6256a433b25b4075 + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0}) + +Need to update fatcat file worker to support single-file filesets... was that the plan? 
+ + head /tmp/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0}) + +Trying again 2022-03-23: + + git log | head -n1 + # commit 134cb050988be2c545af89e0a67c4998307bb819 + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0}) + + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0}) + + head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0}) + +Fixed a small logic error in insert path. 
+ + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0}) + +archive.org datasets are *not* getting uploaded with the correct path: path +directory prefixes are getting clobbered. + +## Summary + +As follow-up, it may be worth doing another manual round of ingest requests. +After that, it would be good to fill in "glue" code so that this can be done with +kafka workers, and do re-tries/dumps using the sandcrawler SQL database. Then we can +start scaling up more ingest, using the ingest tool, "bulk mode" processing, +heritrix crawls from `no-capture` dumps, etc, similar to the bulk file ingest +process. + +For scaling, let's do a "full" ingest request generation of all datasets, and +crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens +of millions of mostly DOIs (doi.org URLs), which should crawl quickly. + +Then, do bulk downloading with the ingest worker, perhaps on misc-vm or aitio, +uploading large datasets to archive.org, but not doing SPN web requests. Feed +the resulting huge file seedlist into a heritrix crawl to download web files. + +Will need to add support for more specific platforms. + + +### Huge Bulk Ingest Prep + +On prod instance: + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz + # Expecting 11264787 release objects in search queries + # TIMEOUT ERROR + # 6.07M 19:13:02 [87.7 /s] (partial) + +As follow-up, should do a full batch (not partial). For now the search index is too +unreliable (read timeouts). + + zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \ + | jq .base_url -r \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > ingest_dataset_bulk.2022-01-05.partial.schedule + +## Retries (2022-01-12) + +This is after having done a bunch of crawling. 
+ + cat ingest_dataset_combined_results6.json \ + | rg '"no-capture"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request -c \ + | pv -l \ + > ingest_dataset_retry.json + => 6.51k 0:00:01 [3.55k/s] + + cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json + +## Retries (2022-02) + +Finally got things to complete end to end for this batch! + + cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr + 3220 terminal-bad-status + 2120 no-capture + 380 empty-manifest + 264 success-file + 251 success + 126 success-existing + 39 mismatch + 28 error-platform-download + 24 too-many-files + 20 platform-scope + 13 platform-restricted + 13 mismatch-size + 6 too-large-size + 3 transfer-encoding-error + 2 no-platform-match + 2 error-archiveorg-upload + 1 redirect-loop + 1 empty-blob + +Some more URLs to crawl: + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt + # 1.00 + # just a single DOI that failed to crawl, for whatever reason + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt + +These are ready to crawl, in the existing dataset crawl. + + cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule + +## Running Uploads Again + +Looks like the temporary download files got wiped on `wbgrp-svc263`. 
This is a +big bummer! Will need to download many of these over again. + + # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316 + # skip_cleanup_local_files=True is still default + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json + + # filter out zenodo, very slow: + # rg -v 10.5281 \ diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md new file mode 100644 index 0000000..941519f --- /dev/null +++ b/notes/ingest/2022-01-06_patch_crawl.md @@ -0,0 +1,398 @@ + +Starting another paper fulltext patch crawl, targeting recent OA content which +has failed to ingest, and platforms (arxiv, etc). + +Specifically: + +- "daily" changelog ingest requests from all time, which failed with various status codes +- pdf no-capture +- SPN errors +- terminal-bad-status with 5xx, 429 +- gateway-timeout +- html no-capture +- html-resource-no-capture + +Most of these are dumped in a single complex query (below). + +TODO: html-resource-no-capture (from error message? or do SPN requests separately?) 
+ + +## Initial 'no-capture' Seedlist + +Dump terminal URLs (will do ingest requests later, using similar command): + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id 
NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND 
ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt'; + => COPY 6389683 + +TODO: filter out archive.org/www.archive.org + + cat patch_terminal_url.2022-01-12.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-01-12.uniq.txt + => 5.73M 0:00:47 [ 120k/s] + + # note: tweaks and re-ran the above after inspecting this output + cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 799045 doi.org + 317557 linkinghub.elsevier.com + 211091 arxiv.org + 204334 iopscience.iop.org + 139758 dialnet.unirioja.es + 130331 www.scielo.br + 124626 www.persee.fr + 85764 digitalrepository.unm.edu + 83913 www.mdpi.com + 79662 www.degruyter.com + 75703 www.e-periodica.ch + 72206 dx.doi.org + 69068 escholarship.org + 67848 idus.us.es + 57907 zenodo.org + 56624 ir.opt.ac.cn + 54983 projecteuclid.org + 52226 rep.bntu.by + 48376 osf.io + 48009 pubs.rsc.org + 46947 publikationen.ub.uni-frankfurt.de + 45564 www.research-collection.ethz.ch + 45153 dk.um.si + 43313 www.ssoar.info + 40543 scholarworks.umt.edu + +TODO: cleanup ingest request table in sandcrawler-db: +- remove filtered OAI-PMH prefixes +- remove any invalid `base_url` (?) 
+ +## More Seedlist (2022-02-08) + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' 
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt'; + => COPY 444764 + + cat patch_terminal_url.2022-02-08.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-02-08.uniq.txt + => 426k 0:00:04 [ 103k/s] + + cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 60123 www.degruyter.com + 59314 arxiv.org + 43674 zenodo.org + 17771 doi.org + 9501 linkinghub.elsevier.com + 9379 www.mdpi.com + 5691 opendata.uni-halle.de + 5578 scholarlypublishingcollective.org + 5451 era.library.ualberta.ca + 4982 www.cairn.info + 4306 www.taylorfrancis.com + 4189 papers.ssrn.com + 4157 apps.crossref.org + 4089 www.sciencedirect.com + 4033 mdpi-res.com + 3763 dlc.mpg.de + 3408 osf.io + 2603 www.frontiersin.org + 2594 watermark.silverchair.com + 2569 journals.lww.com + 1787 underline.io + 1680 archiviostorico.fondazione1563.it + 1658 www.jstage.jst.go.jp + 1611 cyberleninka.ru + 1535 www.schoeningh.de + + cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule + => Done + +Copied to crawler svc206 and added to frontier. + + +## Bulk Ingest Requests (2022-02-28) + +Note that we are skipping OAI-PMH here, because we just did a separate ingest +for those. + +This is going to dump many duplicate lines (same `base_url`, multiple +requests), but that is fine. Expecting something like 7 million rows. 
+ + COPY ( + -- SELECT ingest_file_result.terminal_url + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated <= '2022-02-08' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + -- ingest_request.link_source = 'oai' + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json'; + # COPY 3053219 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md new file mode 100644 index 0000000..a6f08dd --- /dev/null +++ b/notes/ingest/2022-01-13_doi_crawl.md @@ -0,0 +1,248 @@ + +Could roll this in to current patch crawl instead of starting a new crawl from scratch. + +This file is misnamed; these are mostly non-DOI-specific small updates. + +## KBART "almost complete" experimentation + +Random 10 releases: + + cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}' + https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone + https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed + https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works + https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern) + https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. 
force re-crawl worked for one copy + https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success + https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref + https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success + https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success + https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed + +Try some more! + + https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success + https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success? + https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry + https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site + https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI + https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success + https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success + https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken + https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub) + https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success + + +## Seeds: fixed OJS URLs + +Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like: + +- `no-pdf-link` with terminal URL like `/article/view/` +- `redirect-loop` with terminal URL like `/article/view/` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 
'no-pdf-link' + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json'; + => COPY 326577 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json + cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Done/running. + + COPY ( + SELECT ingest_file_result.terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ( + ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'link-loop' + ) + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt'; + => COPY 342415 + + cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule + +Done/seeded. 
+ +## Seeds: scitemed.com + +Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%/article/view/%' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json'; + # SKIPPED + +Actually there are very few of these. + +## Seeds: non-OA paper DOIs + +There are many DOIs out there which are likely to be from small publishers, on +the web, and would ingest just fine (eg, in OJS). + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count + 30,938,106 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count + 6,664,347 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count + 8,258,111 + +Do the 8 million first, then maybe try the 30.9 million later? Do sampling to +see how many are actually accessible? From experience with KBART generation, +many of these are likely to crawl successfully. 
+ + ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz + # re-running 2022-02-08 after this VM was upgraded + # Expecting 8321448 release objects in search queries + # DONE + +This is large enough that it will probably be a bulk ingest, and then probably +a follow-up crawl. + +## Seeds: HTML and XML links from HTML biblio + + kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \ + | pv -l \ + | rg '"(html|xml)_fulltext_url"' \ + | rg '"no-pdf-link"' \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.json.gz + + # cut this off at some point? gzip is terminated weird + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l + # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file + # 2,538,433 + +Prepare seedlists (to include in heritrix patch crawl): + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \ + | jq .html_biblio.xml_fulltext_url -r \ + | rg '://' \ + | sort -u -S 4G \ + | pv -l \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz + # 1.24M 0:01:35 [12.9k/s] + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \ + | jq .html_biblio.html_fulltext_url -r \ + | rg '://' \ + | sort -u -S 4G \ + | pv -l \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz + # 549k 0:01:27 [6.31k/s] + + zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \ + | cut -f3 -d/ \ + | sort -S 4G \ + | uniq -c \ + | sort -nr \ + | head -n20 + + 534005 dlc.library.columbia.edu + 355319 www.degruyter.com + 196421 zenodo.org + 101450 serval.unil.ch + 100631 biblio.ugent.be + 47986 digi.ub.uni-heidelberg.de + 39187 www.emerald.com + 33195 www.cairn.info + 25703 boris.unibe.ch + 
19516 journals.openedition.org + 15911 academic.oup.com + 11091 repository.dl.itc.u-tokyo.ac.jp + 9847 oxfordworldsclassics.com + 9698 www.thieme-connect.de + 9552 www.idunn.no + 9265 www.zora.uzh.ch + 8030 www.scielo.br + 6543 www.hanspub.org + 6229 asmedigitalcollection.asme.org + 5651 brill.com + + zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \ + | awk '{print "F+ " $1}' \ + > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + + wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + +Added to `JOURNALS-PATCH-CRAWL-2022-01` + +## Seeds: most doi.org terminal non-success + +Unless it is a 404, should retry. + +TODO: generate this list + +## Non-OA DOI Bulk Ingest + +Had previously run: + + cat ingest_nonoa_doi.json.gz \ + | rg -v "doi.org/10.2139/" \ + | rg -v "doi.org/10.1021/" \ + | rg -v "doi.org/10.1121/" \ + | rg -v "doi.org/10.1515/" \ + | rg -v "doi.org/10.1093/" \ + | rg -v "europepmc.org" \ + | pv -l \ + | gzip \ + > nonoa_doi.filtered.ingests.json.gz + # 7.35M 0:01:13 [99.8k/s] + +Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has +entirely finished, but after almost all queues (domains) have been done for +several days. + + zcat nonoa_doi.filtered.ingests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Looks like many jstage `no-capture` status; these are still (slowly) crawling. 
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md new file mode 100644 index 0000000..9722459 --- /dev/null +++ b/notes/ingest/2022-03_doaj.md @@ -0,0 +1,278 @@ + +plan: +- usual setup and dump ingest requests +- filter ingest requests to targeted ccTLDs, and add those to crawl first + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz' + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz + # 9.08M 0:37:38 [4.02k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373}) + + +## Check Pre-Crawl Status + +2022-03-09, before the above load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 2919808 + html | wrong-scope | 1098998 + pdf | no-pdf-link | 481532 + pdf | redirect-loop | 429006 + html | success | 342501 + html | unknown-scope | 225390 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187762 + html | no-capture | 185418 + pdf | no-capture | 171273 + pdf | 
null-body | 129028 + html | null-body | 100296 + pdf | terminal-bad-status | 91551 + pdf | link-loop | 25447 + html | wrong-mimetype | 22640 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + pdf | wrong-mimetype | 7688 + xml | success | 6897 + html | petabox-error | 5529 + pdf | wayback-error | 2706 + xml | null-body | 2353 + pdf | | 2063 + pdf | wayback-content-error | 1349 + html | cdx-error | 1169 + pdf | cdx-error | 1130 + pdf | petabox-error | 679 + html | | 620 + pdf | empty-blob | 562 + html | blocked-cookie | 545 + (30 rows) + +After the above load: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3036457 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108132 + pdf | no-pdf-link | 485703 + pdf | redirect-loop | 436085 + html | success | 342594 + html | unknown-scope | 225412 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187999 + html | no-capture | 187310 + pdf | no-capture | 172033 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91799 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6897 + html | petabox-error | 5530 + pdf | wayback-error | 2707 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 771 + pdf | empty-blob | 562 + (30 rows) + +Dump ingest requests for crawling (or bulk ingest first?): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR 
ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json'; + => COPY 353819 + +Not that many! Guess the filters are important? (Likely cause: for requests with no `ingest_file_result` row, `terminal_url` is NULL, so each `terminal_url NOT LIKE ...` evaluates to NULL and the `status IS NULL` rows get dropped; wrapping those filters in `(terminal_url IS NULL OR ...)` would keep them.) 
+ + SELECT COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ); + => 3202164 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json + => 353k 0:00:16 [21.0k/s] + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Dump seeds again (for crawling): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE 
'%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json'; + # COPY 350661 + +And stats again: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3037059 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108476 + pdf | no-pdf-link | 485705 + pdf | redirect-loop | 436850 + html | success | 342762 + html | unknown-scope | 225412 + html | redirect-loop | 224683 + html | html-resource-no-capture | 188058 + html | no-capture | 185734 + pdf | no-capture | 170452 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91875 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19042 + html | terminal-bad-status | 13333 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6898 + html | petabox-error | 5535 + pdf | wayback-error | 2711 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 772 + html | blocked-cookie | 769 + (30 rows) + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json + +Create seedlist: + 
+ + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | jq -r .base_url \ + | sort -u -S 4G \ + > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt + +Sent off and added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will +re-ingest when that completes (a week or two?). + + +## Bulk Ingest + +After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up. + + # 2022-03-22 + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md new file mode 100644 index 0000000..d2a8d71 --- /dev/null +++ b/notes/ingest/2022-03_oaipmh.md @@ -0,0 +1,40 @@ + +Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. + +Note that Martin excluded many Indonesian endpoints, will need to follow-up on +those. + +## Prep + +Fetch metadata snapshot: + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst + +Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): + + zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ + | rg -v 'oai:kb.dk:' \ + | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ + | rg -v 'oai:hispana.mcu.es:' \ + | rg -v 'oai:bnf.fr:' \ + | rg -v 'oai:ukm.si:' \ + | rg -v 'oai:biodiversitylibrary.org:' \ + | rg -v 'oai:hsp.org:' \ + | rg -v 'oai:repec:' \ + | rg -v 'oai:n/a:' \ + | rg -v 'oai:quod.lib.umich.edu:' \ + | rg -v 'oai:americanae.aecid.es:' \ + | rg -v 'oai:www.irgrid.ac.cn:' \ + | rg -v 'oai:espace.library.uq.edu:' \ + | rg -v 'oai:edoc.mpg.de:' \ + | rg -v 'oai:bibliotecadigital.jcyl.es:' \ + | rg -v 'oai:repository.erciyes.edu.tr:' \ + | rg -v 'oai:krm.or.kr:' \ + | ./scripts/oai2ingestrequest.py - \ + | 
pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz + +These failed to transform in the expected way; a change in JSON schema from last time? diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md new file mode 100644 index 0000000..23fd35f --- /dev/null +++ b/notes/ingest/2022-04_targeted.md @@ -0,0 +1,144 @@ + +Want to do a crawl similar to recent "patch" crawls, where we run heritrix +crawls to "fill in" missing (`no-capture`) and failed daily ingests (aka, +those requests coming from fatcat-changelog). + + export PATCHDATE=2022-04-20 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR 
ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json'; + # COPY 
4842749 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v www.archive.org \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + # 4.75M 0:01:44 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 1515829 www.jstage.jst.go.jp + 1052953 doi.org + 241704 arxiv.org + 219543 www.sciencedirect.com + 178562 www.persee.fr + 84947 zenodo.org + 67397 www.mdpi.com + 65775 journals.lww.com + 58216 opg.optica.org + 50673 osf.io + 45776 www.degruyter.com + 36664 www.indianjournals.com + 35287 pubs.rsc.org + 33495 www.bmj.com + 33320 www.research-collection.ethz.ch + 29728 www.e-periodica.ch + 28338 iopscience.iop.org + 26364 www.cambridge.org + 23840 onlinelibrary.wiley.com + 23641 platform.almanhal.com + 22660 brill.com + 20288 www.osapublishing.org + 18561 cgscholar.com + 18539 doi.nrct.go.th + 15677 www.frontiersin.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + +TODO: starting with the "quarterly retry" script/query might make more sense? +TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set? 
+ +## Bulk Ingest Requests (post-crawl) + + cd /srv/sandcrawler/src/python + sudo su sandcrawler + pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json + => 4.84M 0:03:14 [24.9k/s] + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => started 2022-05-11 diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md new file mode 100644 index 0000000..bc78998 --- /dev/null +++ b/notes/ingest/2022-04_unpaywall.md @@ -0,0 +1,278 @@ + +New unpaywall snapshot from `2022-03-09`. + +This will probably be the last unpaywall crawl? Will switch to openalex in the +future, because we can automate that ingest process, and run it on our own +schedule. + + export SNAPSHOT=2022-03-09 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=UNPAYWALL-CRAWL-2022-04 + +## Download and Archive + + wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz' + # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470] + + export SNAPSHOT=2022-03-09 + ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT + + # if needed + scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + cd /srv/sandcrawler/src/python + sudo su sandcrawler + pipenv shell + + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > 
/srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json + # 34.9M 3:02:32 [3.19k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + # 34.9M 5:23:15 [1.80k/s] + # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779}) + +So about 6.1M new ingest request rows. + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- take "all time" instead of just this recent capture + -- AND date(ingest_request.created) > '2021-01-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json'; + => COPY 6025671 + + # transform + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json + # 6.03M 0:03:26 [29.1k/s] + + # enqueue for bulk processing + cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2022-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3330232 + success | 2455102 + redirect-loop | 197117 + terminal-bad-status | 82618 + no-pdf-link | 33046 + blocked-cookie | 16078 + link-loop | 6745 + wrong-mimetype | 3416 + wayback-error | 1385 + empty-blob | 1142 + cdx-error | 820 + body-too-large | 292 + bad-gzip-encoding | 281 + wayback-content-error | 267 + | 253 + petabox-error | 215 + skip-url-blocklist | 185 + null-body | 179 + spn2-cdx-lookup-failure | 89 + gateway-timeout | 73 + (20 rows) + +After prior "TARGETED" crawl and bulk ingest finished: + + status | count + -------------------------+--------- + no-capture | 3330055 + success | 2455279 + redirect-loop | 197117 + terminal-bad-status | 82618 + no-pdf-link | 33046 + blocked-cookie | 16079 + link-loop | 6745 + wrong-mimetype | 3416 + wayback-error | 1385 + empty-blob | 1142 + cdx-error | 820 + body-too-large | 292 + bad-gzip-encoding | 281 + wayback-content-error | 267 + | 253 + petabox-error | 215 + skip-url-blocklist | 185 + null-body | 179 + spn2-cdx-lookup-failure | 89 + gateway-timeout | 73 + (20 rows) + +Almost no change, which makes sense because of the `ingest_request.created` +filter. 
+ + +## Dump Seedlist + +Dump rows for crawling: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + -- AND date(ingest_request.created) > '2022-04-01' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status LIKE 'spn2-%' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO 
'/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json'; + => before ingest and arxiv.org DOI exclusion: COPY 3309091 + => COPY 3308914 + + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json + => 3.31M 0:02:22 [23.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT* + 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt + 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json + 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt + 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt + +Inject seedlist into crawler: + + scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + +Top domains? 
+ + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20 + 158497 www.scielo.br + 144732 onlinelibrary.wiley.com + 129349 www.researchsquare.com + 94923 hal.archives-ouvertes.fr + 69293 openresearchlibrary.org + 64584 www.cell.com + 60033 link.springer.com + 50528 www.degruyter.com + 49737 projecteuclid.org + 45841 www.jstage.jst.go.jp + 44819 www.mdpi.com + 44325 ieeexplore.ieee.org + 38091 dr.lib.iastate.edu + 31030 www.nature.com + 30300 discovery.ucl.ac.uk + 27692 ntrs.nasa.gov + 24215 orca.cardiff.ac.uk + 23653 www.frontiersin.org + 23474 pure.rug.nl + 22660 www.sciencedirect.com + + +## Post-Crawl bulk ingest + + # enqueue for bulk processing + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # done: 2022-07-06 + +## Post-Crawl, Post-Ingest Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2022-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 4784948 => +2,329,669 ~77% + redirect-loop | 485270 => + 288,153 ~10% + no-capture | 317598 => -3,012,457 + terminal-bad-status | 267853 => + 185,235 ~ 6% + no-pdf-link | 118303 => + 85,257 + blocked-cookie | 111373 => + 95,294 + skip-url-blocklist | 19368 + link-loop | 9091 + wrong-mimetype | 7163 + cdx-error | 2516 + empty-blob | 1961 + wayback-error | 1922 + body-too-large | 509 + petabox-error | 416 + wayback-content-error | 341 + bad-gzip-encoding | 281 + | 253 + null-body | 179 + spn2-cdx-lookup-failure | 89 
+ gateway-timeout | 73 + (20 rows) + +Groovy! diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md new file mode 100644 index 0000000..ec31a7d --- /dev/null +++ b/notes/ingest/2022-07-15_ingest_fixes.md @@ -0,0 +1,831 @@ + +## HTML `html-resource-no-capture` Fixes + +Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors. + +SQL query: + + select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100; + select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100; + + select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture'; + => 210,528 + +http://agroengineering.it/index.php/jae/article/view/568/609 +- old capture, from `20171017204935` +- missing .css file; seems like an actual case of missing content? +- TODO: re-crawl/re-ingest when CDX is old + +https://www.karger.com/Article/FullText/484130 +- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2 +- resource is live +- this was from DOI-LANDING crawl, no resources captured +- TODO: re-crawl + +https://www.mdpi.com/1996-1073/13/21/5563/htm +- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm +- common crawl capture; no/few resources? 
+- TODO: re-crawl + +http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en +- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg + not on live web +- old (2013) wide crawl +- TODO: re-crawl + +http://g3journal.org/lookup/doi/10.1534/g3.116.027730 +- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif +- old 2018 landing crawl (no resources) +- TODO: re-crawl + +https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full +- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762" +- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1 +- archiveteam crawl +- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page + +https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full +- WORKING + +https://doi.org/10.4000/trajectoires.2317 +- redirect: https://journals.openedition.org/trajectoires/2317 +- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces) +- FIXED + +http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en +- WORKING + +https://f1000research.com/articles/9-571/v2 +- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js' +- added recaptcha.net to blocklist +- still needs a re-crawl +- SPN capture, from 2020, but images were missing? 
+- re-capture has images (though JS still wonky) +- TODO: re-crawl with SPN2 + +http://bio.biologists.org/content/4/9/1163 +- DOI LANDING crawl, no sub-resources +- TODO: recrawl + +http://err.ersjournals.com/content/26/145/170039.full +- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif + on live web +- 2017 targetted heritrix crawl +- TODO: recrawl + +http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN +- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg +- recent archiveteam crawl +- TODO: recrawl + +http://journals.ed.ac.uk/lithicstudies/article/view/1444 +- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081 +- common crawl +- TODO: recrawl + +http://medisan.sld.cu/index.php/san/article/view/495 +- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg +- this single resource is legit missing + +seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests + +request sources: +- fatcat-changelog (doi) +- fatcat-ingest (doi) +- doaj + + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'html' + AND ingest_file_result.status = 'html-resource-no-capture' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + ) + ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json'; + => COPY 210749 + + ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json + +Try a sample of 300: + + shuf -n300 
/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Seeing a bunch of: + + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"] + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"] + ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"] + + "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069", + + + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"] + +These seem to be transfer encoding issues; fixed? 
+ + ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"] + +Full batch: + + # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Not running the full batch for now, because these are almost all `wayback-content-error` issues. + + cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l + 114935 + + cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + + +## Redirect Loops + +Seems like there might have been a bug in how the ingest pipeline dealt with +multiple redirects (eg, 301 to 302 or vice-versa), due to how CDX lookups and +normalization were happening. + +This could be a really big deal because we have over 11 million such ingest +requests! and may even have stopped crawling domains on the basis of redirect +looping. + + select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50; + +http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf +- 'skip-url-blocklist' +- paywall on live web + +http://www.redjournal.org/article/S0360301616308276/pdf +- redirect to 'secure.jbs.elsevierhealth.com' +- ... but re-crawling with SPNv2 worked +- TODO: reingest this entire journal with SPNv2 + +http://www.jmirs.org/article/S1939865415001551/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL: success + +http://www.cell.com/article/S0006349510026147/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- TODO: try SPNv2? 
+- RECRAWL: success + +http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf +- FIXED: success + +http://www.nature.com/articles/hdy1994143.pdf +- blocked-cookie (idp.nature.com / cookies_not_supported) +- RECRAWL: gateway-timeout + +http://www.thelancet.com/article/S0140673619327606/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL: success + +https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf +- FIXED: success + +http://hdl.handle.net/21.11116/0000-0001-B1A2-F +- FIXED: success + +http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf +- FIXED: success + +http://www.jto.org/article/S1556086416329999/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL spn2: success + +http://www.jahonline.org/article/S1054139X16303020/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL spn2: success + +So, wow wow wow, a few things to do here: + +- just re-try all these redirect-loop attempts to update status +- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time! + +Possibly the elsevierhealth stuff will require some deeper fiddling to crawl +correctly. 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'redirect-loop' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json'; + => COPY 6611342 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json + +Start with a sample: + + shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Wow that is a lot of ingest! And a healthy fraction of 'success', almost all +via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full +batch: + + cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +TODO: repeat with broader query (eg, OAI-PMH, MAG, etc). + +## Other + +Revist resolution failed: \"Didn't get exact CDX url/datetime match. 
url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\"" + + https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322 + https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322 + +Fixed! + + +## Broken WARC Record? + +cdx line: + + net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz + +download WARC and run: + + zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20 + +the WARC record: + + WARC/1.0 + WARC-Type: revisit + WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js + WARC-Date: 2022-07-16T08:40:26Z + WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB + WARC-IP-Address: 13.227.21.220 + WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + WARC-Truncated: length + WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0> + Content-Type: application/http; msgtype=response + Content-Length: 493 + + HTTP/1.1 200 OK + Content-Type: application/javascript + Content-Length: 512 + Connection: close + Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT + Accept-Ranges: bytes + Server: AmazonS3 + Date: Fri, 15 Jul 2022 16:36:08 GMT + ETag: 
"1c28db48d4012f0221b63224a3bb7137" + Vary: Accept-Encoding + X-Cache: Hit from cloudfront + Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront) + X-Amz-Cf-Pop: SFO20-C1 + X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg== + Age: 57859 + +where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines? + +## osf.io + + select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30; + + status | terminal_status_code | count + -------------------------+----------------------+------- + terminal-bad-status | 404 | 92110 + no-pdf-link | 200 | 46932 + not-found | 200 | 20212 + no-capture | | 8599 + success | 200 | 7604 + redirect-loop | 301 | 2125 + terminal-bad-status | 503 | 1657 + cdx-error | | 1301 + wrong-mimetype | 200 | 901 + terminal-bad-status | 410 | 364 + read-timeout | | 167 + wayback-error | | 142 + gateway-timeout | | 139 + terminal-bad-status | 500 | 76 + spn2-error | | 63 + spn2-backoff | | 42 + petabox-error | | 39 + spn2-backoff | 200 | 27 + redirect-loop | 302 | 19 + terminal-bad-status | 400 | 15 + terminal-bad-status | 401 | 15 + remote-server-error | | 14 + timeout | | 11 + terminal-bad-status | | 11 + petabox-error | 200 | 10 + empty-blob | 200 | 8 + null-body | 200 | 6 + spn2-error:unknown | | 5 + redirect-loop | 308 | 4 + spn2-cdx-lookup-failure | | 4 + (30 rows) + +Many of these are now non-existent, or datasets/registrations, not articles. +Hrm. 
+ + +## Large DOAJ no-pdf-link Domains + + SELECT + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain, + COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_request.base_url = ingest_file_result.base_url + WHERE + ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source = 'doaj' + GROUP BY + domain + ORDER BY + COUNT(*) DESC + LIMIT 50; + + domain | count + -------------------------------------------------------+-------- + www.sciencedirect.com | 211090 + auth.openedition.org | 20741 + journal.frontiersin.org:80 | 11368 + journal.frontiersin.org | 6494 + ejde.math.txstate.edu | 4301 + www.arkat-usa.org | 4001 + www.scielo.br | 3736 + www.lcgdbzz.org | 2892 + revistas.uniandes.edu.co | 2715 + scielo.sld.cu | 2612 + www.egms.de | 2488 + journals.lww.com | 2415 + ter-arkhiv.ru | 2239 + www.kitlv-journals.nl | 2076 + www.degruyter.com | 2061 + jwcn-eurasipjournals.springeropen.com | 1929 + www.cjcnn.org | 1908 + www.aimspress.com | 1885 + vsp.spr-journal.ru | 1873 + dx.doi.org | 1648 + www.dlib.si | 1582 + aprendeenlinea.udea.edu.co | 1548 + www.math.u-szeged.hu | 1448 + dergipark.org.tr | 1444 + revistas.uexternado.edu.co | 1429 + learning-analytics.info | 1419 + drive.google.com | 1399 + www.scielo.cl | 1326 + www.economics-ejournal.org | 1267 + www.jssm.org | 1240 + html.rhhz.net | 1232 + journalofinequalitiesandapplications.springeropen.com | 1214 + revistamedicina.net | 1197 + filclass.ru | 1154 + ceramicayvidrio.revistas.csic.es | 1152 + gynecology.orscience.ru | 1126 + www.tobaccoinduceddiseases.org | 1090 + www.tandfonline.com | 1046 + www.querelles-net.de | 1038 + www.swjpcc.com | 1032 + microbiologyjournal.org | 1028 + revistas.usal.es | 1027 + www.medwave.cl | 1023 + ijtech.eng.ui.ac.id | 1023 + www.scielo.sa.cr | 1021 + vestnik.szd.si | 986 + www.biomedcentral.com:80 | 984 + scielo.isciii.es | 983 + bid.ub.edu | 970 + 
www.meirongtv.com | 959 + (50 rows) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5; + http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html + http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html + http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html + http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html + http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html + # plain HTML, not really parse-able + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5; + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216 + # fixed (embed PDF) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5; + https://doi.org/10.5935/0034-7280.20200075 + https://doi.org/10.5935/0004-2749.20200071 + https://doi.org/10.5935/0034-7280.20200035 + http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014 + https://doi.org/10.5935/0034-7280.20200047 + # need recrawls? 
+ # then success + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5; + https://doi.org/10.3205/16dgnc020 + http://nbn-resolving.de/urn:nbn:de:0183-19degam1126 + http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml + http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml + http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625 + # mostly abstracts, don't have PDF versions + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5; + https://doi.org/10.26442/terarkh201890114-47 + https://doi.org/10.26442/00403660.2019.12.000206 + https://journals.eco-vector.com/0040-3660/article/download/32246/pdf + https://journals.eco-vector.com/0040-3660/article/download/33578/pdf + https://doi.org/10.26442/00403660.2019.12.000163 + # working, needed recrawls (some force re-crawls) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5; + + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5; + https://srl.si/ojs/srl/article/view/2910 + https://srl.si/ojs/srl/article/view/3640 + 
https://srl.si/ojs/srl/article/view/2746 + https://srl.si/ojs/srl/article/view/2557 + https://srl.si/ojs/srl/article/view/2583 + # fixed? (dlib.si) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5; + http://www.jssm.org/vol4/n4/8/v4n4-8text.php + http://www.jssm.org/vol7/n1/19/v7n1-19text.php + http://www.jssm.org/vol9/n3/10/v9n3-10text.php + http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml + http://www.jssm.org/vol7/n2/11/v7n2-11text.php + # works as an HTML document? otherwise hard to select on PDF link + + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5; + https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism + https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay + https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach + https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad + https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre + # fixed + # TODO: XXX: re-crawl/ingest + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5; + https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/ + https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/ + 
https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/ + https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/ + https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/ + # HTML article, no PDF + # ... but only sometimes + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5; + http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878 + https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act + http://dx.doi.org/10.5867/medwave.2012.03.5332 + https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act + http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964 + # HTML article, no PDF + +Re-ingest HTML: + + https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE) + https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE) + +Re-ingest PDF: + + doi_prefix:10.5935 (DONE) + doi_prefix:10.26442 + +## More Scielo + +More scielo? `doi_prefix:10.5935 in_ia:false` + + http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873 + # OJS? 
fixed + + https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240 + # working, but needed re-crawl + + http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft + +A few others, mostly now working + +## Recent OA DOIs + + fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json + + wc -l recent_missing_oa.json + 24433 + + cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head + 4968 10.3390 + 1261 10.1080 + 687 10.23668 + 663 10.1021 + 472 10.1088 + 468 10.4000 + 367 10.3917 + 357 10.1364 + 308 10.4230 + 303 10.17863 + + cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr + 19496 crossref + 4836 datacite + 101 null + + cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr + 9575 longtail + 8419 null + 3861 society + 822 unipress + 449 oa + 448 scielo + 430 commercial + 400 repository + 22 other + 7 archive + + cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head + 4871 MDPI AG + 1107 Informa UK (Taylor & Francis) + 665 EAG-Publikationen + 631 American Chemical Society + 451 IOP Publishing + 357 The Optical Society + 347 OpenEdition + 309 CAIRN + 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik + 303 Apollo - University of Cambridge Repository + + cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head + 4908 null + 378 Sustainability + 327 ACS Omega + 289 Optics Express + 271 International Journal of Environmental Research and Public Health + 270 International Journal of Health Sciences + 238 Sensors + 223 International Journal of Molecular 
Sciences + 207 Molecules + 193 Proceedings of the National Academy of Sciences of the United States of America + + cat recent_missing_oa.json \ + | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \ + | wc -l + 16558 + + cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r + 10.3390/molecules27144419 + => was a 404 + => recrawl was successful + 10.3390/math10142398 + => was a 404 + 10.3390/smartcities5030039 + => was a 404 + +Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation. +Could be just a fatcat script, or a sandcrawler query. + + cat recent_missing_oa.json \ + | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \ + | shuf -n10 | jq .doi -r + + https://doi.org/10.18452/24860 + => success (just needed quarterly retry?) + => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki) + => current status is "bad-redirect" + https://doi.org/10.26181/20099540.v1 + => success + => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30 + => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540 + https://doi.org/10.4230/lipics.sea.2022.22 + => there is a bug resulting in trailing slash in `citation_pdf_url` + => fixed as a quirks mode + => emailed to report + https://doi.org/10.3897/aca.5.e89679 + => success + => e6fd1e066c8a323dc56246631748202d5fb48808 + => current status is 'bad-redirect' + https://doi.org/10.1103/physrevd.105.115035 + => was 404 + => success after force-recrawl of the terminal URL (not base URL) + https://doi.org/10.1155/2022/4649660 + => was 404 + => success after force-recrawl (of base_url) + https://doi.org/10.1090/spmj/1719 + => paywall (not actually OA) + => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA? 
+ https://doi.org/10.1139/as-2022-0011 + => was no-pdf-link + => fixed fulltext URL extraction + => still needed to re-crawl terminal PDF link? hrm + https://doi.org/10.31703/grr.2022(vii-ii).02 + => was no-pdf-link + => fixed! success + https://doi.org/10.1128/spectrum.00154-22 + => was 404 + => now repeatably 503, via SPN + https://doi.org/10.51601/ijersc.v3i3.393 + => 503 server error + https://doi.org/10.25416/ntr.20137379.v1 + => is figshare + => docx (not PDF) + https://doi.org/10.25394/pgs.20263698.v1 + => figshare + => embargo'd + https://doi.org/10.24850/j-tyca-14-4-7 + => was no-pdf-link + => docs.google.com/viewer (!) + => now handle this (success) + https://doi.org/10.26267/unipi_dione/1832 + => was bad-redirect + => success + https://doi.org/10.25560/98019 + => body-too-large + => also, PDF metadata fails to parse + => is actually like 388 MByte + https://doi.org/10.14738/abr.106.12511 + => max-hops-exceeded + => bumped max-hops from 6 to 8 + => then success (via google drive) + https://doi.org/10.24350/cirm.v.19933803 + => video, not PDF + https://doi.org/10.2140/pjm.2022.317.67 + => link-loop + => not actually OA + https://doi.org/10.26265/polynoe-2306 + => was bad-redirect + => now success + https://doi.org/10.3389/fpls.2022.826875 + => frontiers + => was terminal-bad-status (403) + => success on retry (not sure why) + => maybe this is also a date-of-publication thing? + => not sure all these should be retried though + https://doi.org/10.14198/medcom.22240 + => was terminal-bad-status (404) + => force-recrawl resulted in an actual landing page, but still no-pdf-link + => but actual PDF is a real 404, it seems. oh well + https://doi.org/10.31729/jnma.7579 + => no-capture + https://doi.org/10.25373/ctsnet.20146931.v2 + => figshare + => video, not document or PDF + https://doi.org/10.1007/s42600-022-00224-0 + => not yet crawled/attempted (!) 
+ => springer + => not actually OA + https://doi.org/10.37391/ijeer.100207 + => some upstream issue (server not found) + https://doi.org/10.1063/5.0093946 + => aip.scitation.org, is actually OA (can download in browser) + => cookie trap? + => redirect-loop (seems like a true redirect loop) + => retrying the terminal PDF URL seems to have worked + https://doi.org/10.18502/jchr.v11i2.9998 + => no actual fulltext on publisher site + https://doi.org/10.1128/spectrum.01144-22 + => this is a 503 error, even after retrying. weird! + +DONE: check `publisher_type` in chocula for: +- "MDPI AG" +- "Informa UK (Taylor & Francis)" + + cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40 + 4819 ["MDPI AG","longtail"] + 924 ["Informa UK (Taylor & Francis)",null] + 665 ["EAG-Publikationen",null] + 631 ["American Chemical Society","society"] + 449 ["IOP Publishing","society"] + 357 ["The Optical Society","society"] + 336 ["OpenEdition","oa"] + 309 ["CAIRN","repository"] + 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null] + 303 ["Apollo - University of Cambridge Repository",null] + 292 ["Springer (Biomed Central Ltd.)",null] + 275 ["Purdue University Graduate School",null] + 270 ["Suryasa and Sons","longtail"] + 257 ["La Trobe",null] + 216 ["Frontiers Media SA","longtail"] + 193 ["Proceedings of the National Academy of Sciences","society"] + 182 ["Informa UK (Taylor & Francis)","longtail"] + 176 ["American Physical Society","society"] + 168 ["Institution of Electrical Engineers","society"] + 166 ["Oxford University Press","unipress"] + 153 ["Loughborough University",null] + + chocula mostly seems to set these correctly. is the issue that the chocula + computed values aren't coming through or getting updated? probably. both + the release (from container) metadata update; and chocula importer not + doing updates based on this field; and some old/incorrect values. 
+ + did some cleanups of specific containers, and next chocula update should + result in a bunch more `publisher_type` getting populated on older + containers + + +TODO: verify URLs are actually URLs... somewhere? in the ingest pipeline + +TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?) + doi_prefix:10.26181 + +WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?) + doi_prefix:10.3390 (MDPI) + doi_prefix:10.1103 + doi_prefix:10.1155 + +DONE: simply re-ingest all: + doi_prefix:10.4230 + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230' + # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096}) + container_65lzi3vohrat5nnymk3dqpoycy + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy + # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187}) + container_5vp2bio65jdc3blx6rfhp3chde + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde + # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83}) + +DONE: verify and maybe re-ingest all: + is_oa:true publisher:"Canadian Science Publishing" in_ia:false + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts 
wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print' + # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041}) + + +## Re-Ingest bad-redirect, max-hops-exceeded, and google drive + +Similar to `redirect-loop`: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'bad-redirect' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json'; + # COPY 100011 + # after first run: COPY 5611 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'max-hops-exceeded' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json'; + # COPY 3546 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.hit is false + AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%' + AND ( + 
ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json'; + # COPY 1082 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json + + cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + # DONE diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md new file mode 100644 index 0000000..74aeb8d --- /dev/null +++ b/notes/ingest/2022-07-19_dblp.md @@ -0,0 +1,50 @@ + +Cross-posting from fatcat bulk metadata update/ingest. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +## Post-Crawl Stats + +This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run +2022-09-06: + + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'dblp' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-----------------------+-------- + pdf | success | 305142 + pdf | no-pdf-link | 192683 + pdf | no-capture | 42634 + pdf | terminal-bad-status | 38041 + pdf | skip-url-blocklist | 31055 + pdf | link-loop | 9263 + pdf | wrong-mimetype | 4545 + pdf | redirect-loop | 3952 + pdf | empty-blob | 2705 + pdf | wayback-content-error | 834 + pdf | wayback-error | 294 + pdf | petabox-error | 202 + pdf | blocked-cookie | 155 + pdf | cdx-error | 115 + pdf | body-too-large | 66 + pdf | bad-redirect | 19 + pdf | timeout | 7 + pdf | bad-gzip-encoding | 4 + (18 rows) + +That is quite a lot of `no-pdf-link`, might be worth doing a random sample +and/or re-ingest. And a chunk of `no-capture` to retry. diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md new file mode 100644 index 0000000..7e55633 --- /dev/null +++ b/notes/ingest/2022-07_doaj.md @@ -0,0 +1,199 @@ + +This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for +heritrix bulk crawling, along with JALC and DOAJ URLs. 
+ + export SNAPSHOT=2022-07-20 + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz" + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz + # 9.72M 0:36:28 [4.44k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # 9.72M 0:17:04 [9.49k/s] + # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097}) + +Stats after this load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3165539 + pdf | | 2078874 + html | | 1547698 + html | wrong-scope | 1114332 + pdf | no-pdf-link | 517261 + html | success | 388376 + html | unknown-scope | 242044 + pdf | no-capture | 179030 + pdf | terminal-bad-status | 174741 + html | no-capture | 155323 + pdf | null-body | 129267 + pdf | redirect-loop | 127136 + html | html-resource-no-capture | 117275 + html | null-body | 100296 + pdf | blocked-cookie | 71093 + html | redirect-loop | 65519 + html | terminal-bad-status | 64856 + html | blocked-cookie | 64095 + html | spn2-backoff | 55173 + pdf | link-loop | 27440 + html | 
wrong-mimetype | 26016 + html | wayback-content-error | 20109 + xml | | 13624 + pdf | wrong-mimetype | 8411 + xml | success | 6899 + html | petabox-error | 6199 + html | wayback-error | 5269 + html | spn2-cdx-lookup-failure | 4635 + html | spn2-recent-capture | 4527 + xml | null-body | 2353 + (30 rows) + +## Bulk Ingest + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + -- AND ingest_file_result.terminal_url NOT LIKE 
'%://archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json'; + # COPY 3962331 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json + # 3.96M 0:01:47 [36.7k/s] + +Top domains: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 789988 www.mdpi.com + 318142 www.frontiersin.org + 226316 link.springer.com + 204429 www.scielo.br + 201175 www.sciencedirect.com + 72852 ieeexplore.ieee.org + 68983 dx.doi.org + 33286 www.dovepress.com + 26020 elifesciences.org + 23838 www.cetjournal.it + 21102 mab-online.nl + 20242 www.revistas.usp.br + 16564 periodicos.uem.br + 15710 journals.openedition.org + 14514 dergipark.org.tr + 14072 apcz.umk.pl + 13924 ojs.minions.amsterdam + 13717 bmgn-lchr.nl + 13512 ojstest.minions.amsterdam + 10440 journals.asm.org + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # Done + +## Stats Again + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 4704006 + html | wrong-scope | 1761227 + html | success | 778165 + pdf | no-pdf-link | 759805 + html | no-capture | 382080 + html | unknown-scope | 313391 + html | html-resource-no-capture | 292953 + pdf | no-capture | 290311 + pdf | terminal-bad-status | 271776 + pdf | null-body | 129267 + pdf | blocked-cookie | 108491 + html | terminal-bad-status | 103014 + html | null-body | 100296 + html | blocked-cookie | 88533 + pdf | | 81517 + pdf | skip-url-blocklist | 76443 + html | spn2-backoff | 50615 + pdf | link-loop | 45516 + html | wrong-mimetype | 33525 + html | wayback-content-error | 25535 + pdf | empty-blob | 21431 + pdf | redirect-loop | 19795 + html | petabox-error | 18291 + html | empty-blob | 14391 + pdf | wrong-mimetype | 14084 + html | redirect-loop | 12856 + xml | success | 10381 + xml | no-capture | 10008 + html | skip-url-blocklist | 3294 + html | cdx-error | 3275 + (30 rows) + +Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k +PDFs with no attempt at all? Maybe a filter, or bogus URLs. + +Over 1.5M new PDF success over this crawl iteration period, nice. 
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md new file mode 100644 index 0000000..415f23b --- /dev/null +++ b/notes/ingest/2022-07_targeted.md @@ -0,0 +1,140 @@ + +Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs. + + export PATCHDATE=2022-07-29 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE 
'%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json'; + => COPY 3524573 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v '://10\.' \ + | rg -v '://172\.' 
\ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + => 3.11M 0:01:08 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 624948 doi.org + 382492 www.jstage.jst.go.jp + 275087 www.mdpi.com + 157134 www.persee.fr + 108979 www.sciencedirect.com + 94375 www.scielo.br + 50834 onlinelibrary.wiley.com + 49991 journals.lww.com + 30354 www.frontiersin.org + 27963 doaj.org + 27058 www.e-periodica.ch + 24147 dl.acm.org + 23389 aclanthology.org + 22086 www.research-collection.ethz.ch + 21589 medien.die-bonn.de + 18866 www.ingentaconnect.com + 18583 doi.nrct.go.th + 18271 repositories.lib.utexas.edu + 17634 hdl.handle.net + 16366 archives.datapages.com + 15146 cgscholar.com + 13987 dl.gi.de + 13188 www.degruyter.com + 12503 ethos.bl.uk + 12304 preprints.jmir.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + => done + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + + +## Re-Ingest + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json + => 3.52M 0:01:37 [36.2k/s] + +Ingest: + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md new file mode 100644 index 0000000..ac7c68f --- /dev/null +++ b/notes/ingest/2022-09_oaipmh.md @@ -0,0 +1,397 @@ + +Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921> + +I updated the transform script to block some additional domains. + + +## Prep + +Fetch the snapshot: + + cd /srv/sandcrawler/tasks/ + wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst + +Transform to ingest requests: + + cd /srv/sandcrawler/src/python + git log | head -n1 + # commit dfd4605d84712eccb95a63e50b0bcb343642b433 + + pipenv shell + zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz + # 16.1M 1:01:02 [4.38k/s] + +Curious about types, though this would probably be handled at fatcat ingest +time: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt + + head oai_type_counts.txt -n30 + 5623867 info:eu-repo/semantics/article + 5334928 info:eu-repo/semantics/publishedVersion + 3870359 text + 1240225 Text + 829169 Article + 769849 NonPeerReviewed + 665700 PeerReviewed + 648740 Peer-reviewed Article + 547857 article + 482906 info:eu-repo/semantics/bachelorThesis + 353814 Thesis + 329269 Student thesis + 262650 info:eu-repo/semantics/conferenceObject + 185354 Journal articles + 162021 info:eu-repo/semantics/doctoralThesis + 152079 Journal Article + 150226 Research Article + 130217 Conference papers + 127255 Artículo revisado por pares + 124243 Newspaper + 123908 ##rt.metadata.pkp.peerReviewed## + 123309 Photograph + 122981 info:eu-repo/semantics/masterThesis + 116719 Book + 
108946 Image + 108216 Report + 107946 Other + 103562 masterThesis + 103038 info:eu-repo/semantics/other + 101404 StillImage + [...] + +And formats: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt + + head -n 20 oai_format_counts.txt + 11151928 application/pdf + 677413 text + 561656 text/html + 498518 image/jpeg + 231219 Text + 193638 text/xml + 147214 Image + 117073 image/jpg + 110872 pdf + 91323 image/tiff + 76948 bib + 75393 application/xml + 70244 Digitized from 35 mm. microfilm. + 68206 mods + 59227 PDF + 57677 application/epub+zip + 57602 application/octet-stream + 52072 text/plain + 51620 application/msword + 47227 audio/mpeg + +Also, just overall size (number of records): + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l + # 20,840,301 + +Next load in to sandcrawler DB: + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request - + + Traceback (most recent call last): + File "./persist_tool.py", line 311, in <module> + main() + File "./persist_tool.py", line 307, in main + args.func(args) + File "./persist_tool.py", line 119, in run_ingest_request + pusher.run() + File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run + self.worker.push_batch(batch) + File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch + resp = self.db.insert_ingest_request(self.cur, irequests) + File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request + resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) + File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values + cur.execute(b''.join(parts)) + psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx" + DETAIL: Index row references 
tuple (6893121,3) in relation "ingest_request". + HINT: Values larger than 1/3 of a buffer page cannot be indexed. + Consider a function index of an MD5 hash of the value, or use full text indexing. + 15.7M 0:41:48 [6.27k/s] + +Darn, this means we won't get reasonable stats about how many rows were +inserted/updated. + +Patched the persist tool to skip very long URLs, and ran again (backwards, just +URLs which didn't get inserted already): + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \ + | tac \ + | head -n1000000 \ + | pv -l \ + | ./persist_tool.py ingest-request - + # 1.00M 0:03:04 [5.41k/s] + # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0}) + +Status of just the new lines: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+--------- + | 6398455 + success | 540219 + no-pdf-link | 41316 + link-loop | 23871 + no-capture | 11350 + redirect-loop | 8315 + wrong-mimetype | 2394 + terminal-bad-status | 1540 + null-body | 1038 + cdx-error | 272 + empty-blob | 237 + petabox-error | 213 + wayback-error | 186 + blocked-cookie | 107 + timeout | 47 + wayback-content-error | 26 + spn2-cdx-lookup-failure | 21 + skip-url-blocklist | 16 + spn2-backoff | 15 + body-too-large | 13 + (20 rows) + + +## Bulk Ingest + +Should already have filtered domains/prefixes in transform script, so not +including filters here. 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ingest_file_result.status IS NULL + ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json'; + # COPY 6398455 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json + # 6.40M 0:02:18 [46.2k/s] + + cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # DONE + +Expect this ingest to take a week or so. + +Then, run stats again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3617175 + success | 2775036 + no-pdf-link | 449298 + link-loop | 74260 + terminal-bad-status | 47819 + wrong-mimetype | 20195 + redirect-loop | 18197 + empty-blob | 12127 + cdx-error | 3038 + skip-url-blocklist | 2630 + wayback-error | 2599 + petabox-error | 2354 + wayback-content-error | 1617 + blocked-cookie | 1293 + null-body | 1038 + body-too-large | 670 + | 143 + bad-gzip-encoding | 64 + timeout | 47 + spn2-cdx-lookup-failure | 20 + (20 rows) + + +## Crawl Seedlist + + COPY ( + SELECT row_to_json(ingest_request.*) 
+ FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'terminal-bad-status' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'timeout' + OR ingest_file_result.status = 'wayback-content-error' + ) + ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json'; + => COPY 3692846 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json + => 3.69M 0:01:19 [46.6k/s] + +This will be used for re-ingest later. For now, extract URLs: + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | jq .base_url -r \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + => 3.66M 0:00:59 [61.8k/s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | rg '"terminal_url"' \ + | jq -r .result.terminal_url \ + | rg -v ^null$ \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + => 0.00 0:00:05 [0.00 /s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | awk '{print "F+ " $1}' \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +What domains are we crawling? 
+ + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | sort -u -S 4G \ + | cut -d/ -f3 \ + | sort \ + | uniq -c \ + | sort -nr \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + + head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + 91899 raco.cat + 70116 islandora.wrlc.org + 68708 urn.kb.se + 63726 citeseerx.ist.psu.edu + 50370 publications.rwth-aachen.de + 44885 urn.nsk.hr + 38429 server15795.contentdm.oclc.org + 33041 periodicos.ufpb.br + 32519 nbn-resolving.org + 31990 www.ajol.info + 24745 hal.archives-ouvertes.fr + 22569 id.nii.ac.jp + 17239 tilburguniversity.on.worldcat.org + 15873 dspace.nbuv.gov.ua + 15436 digitalcommons.wustl.edu + 14885 www.iiste.org + 14623 www.manchester.ac.uk + 14033 nbn-resolving.de + 13999 opus4.kobv.de + 13689 www.redalyc.org + +Sizes: + + wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + +Copy seedlist to crawler: + + # as regular user + scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp + +## Post-Crawl Bulk Ingest + + # ran 2022-11-16, after crawl cleanup + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -----------------------+--------- + success | 4721164 +1,946,128 + no-pdf-link | 1116290 + no-capture | 673939 + terminal-bad-status | 232217 + link-loop | 148544 + wrong-mimetype | 68841 + redirect-loop | 26262 + empty-blob | 17759 + cdx-error | 6570 + blocked-cookie | 4026 + blocked-wall | 3054 + skip-url-blocklist | 2924 + body-too-large | 2404 + bad-redirect | 1565 + wayback-error | 1320 + petabox-error | 1083 + null-body | 1038 + wayback-content-error | 264 + bad-gzip-encoding | 150 + | 143 + (20 rows) + diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt new file mode 100644 index 0000000..ae06272 --- /dev/null +++ b/notes/ingest_domains.txt @@ -0,0 +1,294 @@ + +## Queries to find broken domains + +Top domains with failed ingests: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + +Status overview for a particular domain: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT domain, terminal_status_code, COUNT((domain, terminal_status_code)) + FROM 
(SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + AND t1.terminal_status_code is not null + GROUP BY domain, terminal_status_code + ORDER BY COUNT DESC; + +Sample recent failures: + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + +## Failing + +www.osapublishing.org + + this publisher (The Optical Society) is systemically using a CAPTCHA to + gate access to PDFs. bummer! could ask them to white-list? + + has citation_pdf_url, so that isn't an issue + + status: "no-pdf-link" + hops: + "https://doi.org/10.1364/optica.6.000798", + "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0" + "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C" + + domain | status | count + -----------------------+---------------------+------- + www.osapublishing.org | no-capture | 16680 + www.osapublishing.org | no-pdf-link | 373 + www.osapublishing.org | redirect-loop | 19 + www.osapublishing.org | terminal-bad-status | 5 + www.osapublishing.org | cdx-error | 1 + www.osapublishing.org | wrong-mimetype | 1 + www.osapublishing.org | spn-error | 1 + www.osapublishing.org | success | 1 + www.osapublishing.org | wayback-error | 1 + (9 rows) + +www.persee.fr + + Seems to be mostly blocking or rate-limiting? 
+ + domain | status | count + ---------------+-------------------------------------+------- + www.persee.fr | no-capture | 37862 + www.persee.fr | terminal-bad-status | 3134 + www.persee.fr | gateway-timeout | 2828 + www.persee.fr | no-pdf-link | 431 + www.persee.fr | spn-error | 75 + www.persee.fr | redirect-loop | 23 + www.persee.fr | success | 8 + www.persee.fr | spn2-error | 2 + www.persee.fr | spn2-error:soft-time-limit-exceeded | 1 + www.persee.fr | wrong-mimetype | 1 + (10 rows) + +journals.openedition.org + + PDF access is via "freemium" subscription. Get redirects to: + + https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053 + + Content is technically open access (HTML and license; for all content?), + but can't be crawled as PDF without subscription. + + domain | status | count + --------------------------+-------------------------+------- + journals.openedition.org | redirect-loop | 29587 + journals.openedition.org | success | 6821 + journals.openedition.org | no-pdf-link | 1507 + journals.openedition.org | no-capture | 412 + journals.openedition.org | wayback-error | 32 + journals.openedition.org | wrong-mimetype | 27 + journals.openedition.org | terminal-bad-status | 13 + journals.openedition.org | spn2-cdx-lookup-failure | 4 + journals.openedition.org | spn-remote-error | 1 + journals.openedition.org | null-body | 1 + journals.openedition.org | cdx-error | 1 + (11 rows) + +journals.lww.com + + no-pdf-link + + domain | status | count + ------------------+----------------+------- + journals.lww.com | no-pdf-link | 11668 + journals.lww.com | wrong-mimetype | 131 + (2 rows) + + doi prefix: 10.1097 + + <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" /> + 
data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw==" + + Some weird thing going on, maybe they are blocking-via-redirect based on + our User-Agent? Seems like wget works, so funny that they don't block that. + +musewide.aip.de + + no-pdf-link + +koreascience.or.kr | no-pdf-link | 8867 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + +www.cairn.info | link-loop | 8717 + +easy.dans.knaw.nl | no-pdf-link | 8262 +scielo.conicyt.cl | no-pdf-link | 7925 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'scielo.conicyt.cl' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%scielo.conicyt.cl%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + + domain | status | count + -------------------+---------------------+------- + scielo.conicyt.cl | no-pdf-link | 7926 + scielo.conicyt.cl | success | 4972 + scielo.conicyt.cl | terminal-bad-status | 1474 + scielo.conicyt.cl | wrong-mimetype | 6 + scielo.conicyt.cl | no-capture | 4 + 
scielo.conicyt.cl | null-body | 1 + + + pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 | + pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 | + pdf | https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 | + pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 | + + These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly? + + pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 | + pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 | + pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 | + + Look like web/xml only. + + TODO: XML ingest (and replay?) support. These are as "<article>", not sure if that is JATS or what. 
+ +www.kci.go.kr | no-pdf-link | 6842 +www.m-hikari.com | no-pdf-link | 6763 +cshprotocols.cshlp.org | no-pdf-link | 6553 +www.bibliotekevirtual.org | no-pdf-link | 6309 +data.hpc.imperial.ac.uk | no-pdf-link | 6071 +projecteuclid.org | link-loop | 5970 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'projecteuclid.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%projecteuclid.org%' + AND status = 'link-loop' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + -------------------+-------------------------+------- + projecteuclid.org | link-loop | 5985 + projecteuclid.org | success | 26 + projecteuclid.org | wayback-error | 26 + projecteuclid.org | wrong-mimetype | 17 + projecteuclid.org | spn2-cdx-lookup-failure | 4 + projecteuclid.org | other-mimetype | 4 + projecteuclid.org | no-capture | 3 + projecteuclid.org | terminal-bad-status | 2 + projecteuclid.org | spn2-error:job-failed | 1 + projecteuclid.org | spn-remote-error | 1 + (10 rows) + + Doing a cookie check and redirect. + + TODO: brozzler behavior to "click the link" instead? 
+ +www.scielo.br | no-pdf-link | 5823 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.scielo.br' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.scielo.br%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ---------------+-------------------------+------- + www.scielo.br | success | 35150 + www.scielo.br | no-pdf-link | 5839 + www.scielo.br | terminal-bad-status | 429 + www.scielo.br | no-capture | 189 + www.scielo.br | wrong-mimetype | 7 + www.scielo.br | spn2-cdx-lookup-failure | 2 + (6 rows) + + Seems to just be the subset with no PDFs. + +get.iedadata.org | no-pdf-link | 5822 +www.pdcnet.org | no-pdf-link | 5798 +publications.rwth-aachen.de | no-pdf-link | 5323 +www.sciencedomain.org | no-pdf-link | 5231 +medicalforum.ch | terminal-bad-status | 4574 +jrnl.nau.edu.ua | link-loop | 4145 +ojs.academypublisher.com | no-pdf-link | 4017 + +## MAG bulk ingest + +- dialnet.unirioja.es | redirect-loop | 240967 + dialnet.unirioja.es | terminal-bad-status | 20320 + => may be worth re-crawling via heritrix? +- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + => and other *.onlinelibrary.wiley.com +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 +- papers.ssrn.com | redirect-loop | 27328 + => blocking is pretty aggressive, using cookies or referrer or something. + maybe a brozzler behavior would work, but doesn't currently + +## Out of Scope + +Datasets only? 
+ +- plutof.ut.ee +- www.gbif.org +- doi.pangaea.de +- www.plate-archive.org + +Historical non-paper content: + +- dhz.uni-passau.de (newspapers) +- digital.ucd.ie (irish historical) + +Mostly datasets (some PDF content): + +- *.figshare.com +- zenodo.com +- data.mendeley.com diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt new file mode 100644 index 0000000..fcdc3e4 --- /dev/null +++ b/notes/possible_ingest_targets.txt @@ -0,0 +1,15 @@ + +- all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5 + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/tasks/2020-10-21_pdfextract_holes.md b/notes/tasks/2020-10-21_pdfextract_holes.md new file mode 100644 index 0000000..c0bb65e --- /dev/null +++ b/notes/tasks/2020-10-21_pdfextract_holes.md @@ -0,0 +1,74 @@ + +Realized I had not enabled persisting of PDF extraction results (thumbnail, +text) in ingest worker when added over the summer. So now need to run a +catch-up. This applied to both "live" and "bulk" ingest. 
+ +## `cdx` / `ingest` / `grobid` catch-up + +First, re-run extraction for cases where we did an ingest, and grobid ran +successfully, and we have a CDX row, but no `pdf_meta`: + + -- this is a slow query + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json' + WITH NULL ''; + => 19,676,116 + +Wow, that is a lot. Many from recent OAI-PMH and OA crawls, presumably. + + cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And again, after a couple partitions got hung up: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json' + WITH NULL ''; + + + cat /grande/snapshots/dump_unextracted_pdf.ingest.2020-11-04.json | rg -v "\\\\" | jq . 
-c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + => 562k 0:00:16 [34.6k/s] + +## `petabox` / `grobid` catch-up + +These didn't all seem to extract correctly before; after 1.5m rows, there were +still 900k unprocessed. Trying again. + + COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM grobid + LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE petabox.sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL + ) + TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-11-04.json' + WITH NULL ''; + + cat /grande/snapshots/dump_unextracted_pdf_petabox.ingest.2020-11-04.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## `cdx` / `grobid` catch-up + +Next will be to process PDFs with GROBID and CDX but no ingest. + diff --git a/notes/tasks/2021-09-09_pdf_url_lists.md b/notes/tasks/2021-09-09_pdf_url_lists.md new file mode 100644 index 0000000..cd8176e --- /dev/null +++ b/notes/tasks/2021-09-09_pdf_url_lists.md @@ -0,0 +1,70 @@ + +Want to dump a URL list to share with partners, filtered to content we think is +likely to be scholarly. + +Columns to include: + +- original URL +- capture timestamp +- SHA1 + +## Stats Overview + +file_meta table, mimetype=application/pdf: 173,816,433 + +cdx table, mimetype=application/pdf: 131,346,703 + +ingest_file_result table, pdf, success: 66,487,928 + +## Ingested PDF URLs + +"Ingested" URLs: ingest_file_result table, pdf and hit=true; include base URL also? 
+ + COPY ( + SELECT + base_url as start_url, + terminal_url as pdf_url, + terminal_dt as pdf_url_timestamp, + terminal_sha1hex as pdf_sha1hex + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_targeted.2021-09-09.tsv' + WITH NULL ''; + => 77,892,849 + +## CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require + + COPY ( + SELECT + cdx.url as pdf_url, + cdx.datetime as pdf_url_timestamp, + cdx.sha1hex as pdf_sha1hex + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ) + TO '/srv/sandcrawler/tasks/wayback_pdf_speculative.2021-09-09.tsv' + WITH NULL ''; + => 147,837,935 + +## Processed web PDFs + +"Parsed web PDFs": `file_meta`, left join CDX + +(didn't do this one) + +--- + +Uploaded all these to <https://archive.org/download/ia_scholarly_urls_2021-09-09> diff --git a/notes/tasks/2021-10-29_crossref_refs_backfill.md b/notes/tasks/2021-10-29_crossref_refs_backfill.md new file mode 100644 index 0000000..94eefec --- /dev/null +++ b/notes/tasks/2021-10-29_crossref_refs_backfill.md @@ -0,0 +1,235 @@ + +The current sandcrawler-db crossref table was backfilled from a 2021-01 +snapshot, and has not been updated since. + +Would like to use the existing fatcat Kafka feed to keep the crossref table up +to date, and also backfill in GROBID reference parsing of all `unstructured` +references. + +Current plan is: + +1. use kafkacat CLI to dump crossref Kafka topic, from the beginning of 2021 up + to some recent date +2. use `persist_tool.py`, with a large batch size (200?) to backfill this dump + into sandcrawler-db. this will update some rows multiple times (if there + have been updates) +3. dump the full crossref table, as a point-in-time snapshot +4. filter to crossref records that have `unstructured` references in them (at + all) +5. 
use `grobid_tool.py` with `parallel` to batch process references +6. backfill these refs using a simple SQL COPY statement +7. deploy crossref persist worker, with ref updates on, and roll the consumer + group back to date of dump +8. wait for everything to catch up + + +## Commands + +Get a timestamp in milliseconds: + + 2021-01-01 is: + 1609488000 in unix time (seconds) + 1609488000000 in milliseconds + +Hrm, oldest messages seem to actually be from 2021-04-28T19:21:10Z though. Due +to topic compaction? Yup, we have a 180 day compaction policy on that topic, +probably from when kafka space was tight. Oh well! + +Updated retention for this topic to `46656000000` (~540 days, ~18 months) using +`kafka-manager` web app. + + kafkacat -C -b wbgrp-svc263.us.archive.org -t fatcat-prod.api-crossref -o s@1609488000000 \ + | pv -l \ + | gzip \ + > crossref_feed_start20210428_end20211029.json.gz + +This resulted in ~36 million rows, 46GB. + +`scp` that around, then run persist on `sandcrawler-db`: + + # in pipenv, as sandcrawler user + # manually edited to set batch size to 200 + zcat /srv/sandcrawler/tasks/crossref_feed_start20210428_end20211029.json.gz \ + | pv -l \ + | ./persist_tool.py crossref - + => 36.8M 11:02:43 [ 925 /s] + +With a single thread, the persist process runs at about 1,000 rows/sec, which +works out to about 10 hours for 36 million rows. + +At the start of this process, total PostgreSQL database size is 832.21G. At the +end, 902.51G. Have not run a `VACUUM FULL` or anything like that. + +Query to dump crossref rows which have any refs and compress output with pigz: + + # dump_crossref.sql + COPY ( + SELECT record + FROM crossref + WHERE record::jsonb @? 
'$.reference[*].unstructured' + -- LIMIT 5 + ) + TO STDOUT + WITH NULL ''; + + # 'sed' required because of double quote escaping in postgresql output: + # https://stackoverflow.com/questions/29869983/postgres-row-to-json-produces-invalid-json-with-double-escaped-quotes/29871069 + # 'rg' filter is just being conservative + + # XXX: next time add to the pipeline: rg -v "\\\\" + # or, find some way to filter/transform this kind of SQL export better? + psql sandcrawler < dump_crossref.sql \ + | sed 's/\\"/\"/g' \ + | rg '^\{' \ + | pv -l \ + | pigz \ + > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz + => 26.1M 3:22:51 [2.15k/s] + + # NOTE: -j40 is for production run with ~dedicated GROBID server with many cores + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.json.gz \ + | rg -v "\\\\" \ + | parallel -j35 --linebuffer --round-robin --pipe ./grobid_tool.py --grobid-host http://wbgrp-svc096.us.archive.org:8070 parse-crossref-refs - \ + | pv -l \ + | pigz \ + > /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz + + # from earlier testing with -j40: able to do about 300-500 records/second + # 23.9k 0:01:14 [ 320 /s] + # 134518 total refs parsed + # ~1817 refs/second parsed + + # with errors, got through about: 2.08M 1:38:20 [ 352 /s] + # was still seeing bad JSON? + # JSON lines pushed: Counter({'total': 105898, 'pushed': 105886, 'error-json-decode': 12}) + + # finally, without errors: + # 18.6M 8:35:02 [ 603 /s] + +In the next step, going to need a small direct persist worker to copy lines +verbatim into just the `grobid_refs` table. 
+ +## Errors + +Got errors when running for real: + + xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 114, column 33 + + requests.exceptions.HTTPError: 500 Server Error: Internal Server Error for url: http://wbgrp-svc096.us.archive.org:8070/api/processCitationList + + urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f54b0a3bd00>: Failed to establish a new connection: [Errno 99] Cannot assign requested address')) + + + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ERROR [2021-11-03 06:57:32,569] org.grobid.service.process.GrobidRestProcessString: An unexpected exception occurs. + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! java.lang.NullPointerException: null + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.data.BiblioItem.cleanTitles(BiblioItem.java:1784) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingLayoutTokenMultiple(CitationParser.java:175) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.CitationParser.processingStringMultiple(CitationParser.java:92) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.core.engines.Engine.processRawReferences(Engine.java:168) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.process.GrobidRestProcessString.processCitationList(GrobidRestProcessString.java:316) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at org.grobid.service.GrobidRestService.processCitationListReturnXml_post(GrobidRestService.java:581) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! 
at sun.reflect.GeneratedMethodAccessor19.invoke(Unknown Source) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + Nov 03 06:57:32 wbgrp-svc096.us.archive.org GROBID[400404]: ! at java.lang.reflect.Method.invoke(Method.java:498) + [...] + +Bogus example reference causing 500 error (among other non-error citations) (doi:10.5817/cz.muni.m210-9541-2019): + + 'Müller, R., Šidák, P. (2012). Slovník novější literární teorie. Praha: Academia.' + '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0' + 'Šotkovská, J. (2008). Rané divadelní hry Milana Uhdeho; diplomová práce. Brno: Masarykova univerzita.', + +s.strip() in python would remove these non-breaking spaces (update: implemented this later) + + Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 & Online: ISSN 0974 – 9640) + +Also: + + truncating very large reference list for doi:10.1017/chol9780521264303.033 len:2281 + truncating very large reference list for doi:10.1017/chol9780521263351.011 len:3129 + truncating very large reference list for doi:10.1017/chol9780521263351.022 len:2968 + truncating very large reference list for doi:10.1017/chol9780521264303.036 len:2221 + truncating very large reference list for doi:10.1017/chol9780521264303.007 len:2238 + truncating very large reference list for doi:10.1017/chol9780521086912.001 len:2177 + truncating very large reference list for doi:10.1017/chol9780521228046.002 len:2133 + truncating very large reference list for doi:10.1017/chol9780521264303.035 len:2221 + truncating very large reference list for doi:10.1017/chol9780521264303.002 len:2279 + +Seems like bumping to 2500 as the maximum reference list size might be +reasonable (it is 2000 currently). 
+ +After some refactoring, still getting: + + requests.exceptions.ConnectionError + +This is because I am doing POST without a session. + +Then, still got requests.exceptions.ReadTimeout + +Finally, got through the whole batch, (`18.6M 8:35:02 [ 603 /s]` output), with +only a few dozen rows like: + + GROBID returned bad XML for Crossref DOI: 10.1007/978-3-030-03008-7_21-1 + GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1496-8_3 + GROBID HTTP timeout for Crossref DOI: 10.1007/978-1-4757-1493-7_3 + GROBID returned bad XML for Crossref DOI: 10.1007/978-3-319-96184-2_2 + GROBID returned bad XML for Crossref DOI: 10.1063/1.5031970 + truncating very large reference list for doi:10.1007/978-1-4757-1499-9_15 len:11401 + GROBID returned bad XML for Crossref DOI: 10.1016/j.oraloncology.2019.104562 + GROBID returned bad XML for Crossref DOI: 10.1016/j.pec.2020.04.010 + +So things seem to be working! + +Summary lines looked like: + + JSON lines pushed: Counter({'total': 531487, 'pushed': 531487}) + Worker: Counter({'total': 536541, 'failed': 3}) + +Failures per batch were on the order of 0 to 3. + +## Postgres Backfill + +Start with a sample: + + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \ + | head -n1000 \ + | ./persist_tool.py grobid-refs - + # Worker: Counter({'total': 1000, 'insert-grobid_refs': 1000, 'update-grobid_refs': 0}) + + # same command again: + # Worker: Counter({'total': 1000, 'update-grobid_refs': 1000, 'insert-grobid_refs': 0}) + +Example DOIs: + + # no refs + 10.1007/978-1-349-04135-0_3 + http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-04135-0_3" + + # with refs + 10.1007/978-1-349-03594-6_2 + http get :3030/crossref_with_refs "doi==eq.10.1007/978-1-349-03594-6_2" + +Seems to be working, so will do the full backfill. Can check table sizes on a +per-table basis when complete. 
+ + zcat /srv/sandcrawler/tasks/crossref_sandcrawler_unstructured.grobid_refs.json.gz \ + | pv -l \ + | ./persist_tool.py grobid-refs - + # Worker: Counter({'total': 18646668, 'insert-grobid_refs': 18639195, 'update-grobid_refs': 7473}) + + +## Kafka Setup + +Added ansible config and deployed persist-crossref worker. + +First roll-back just a couple days as a test: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-11-07T00:00:00.000 + + # eg: Import counts: Counter({'total': 372350, 'insert-grobid_refs': 326987, 'update-crossref': 265581, 'insert-crossref': 106769, 'update-grobid_refs': 45362, 'skip': 1}) + +Then roll-back to before the snapshot and backfill, to catch up: + + ./kafka-consumer-groups.sh --bootstrap-server localhost:9092 --group persist-crossref --reset-offsets --topic fatcat-prod.api-crossref --to-datetime 2021-10-26T00:00:00.000 + +Ran this last command on 2021-11-10, and total lag was around 2,566,741. diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md new file mode 100644 index 0000000..5fb69d1 --- /dev/null +++ b/notes/tasks/2021-12-06_regrobid.md @@ -0,0 +1,380 @@ + +Want to test recent updates of GROBID (to fix regex issue), and also re-process +a number of PDFs which failed to process with GROBID initially. + + +## HTTP 503 + +These are attempts which failed because GROBID was too busy or not running. + + # IMPROVED BELOW + COPY ( + SELECT row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4749 + +Not actually that many, which seems good. Confirm that these are uniq by sha1hex: + + cat ungrobided_fatcat.2021-12-06.grobid503.json | jq .sha1hex -r | sort | uniq -d | wc -l + # 302 + +Nope! 
Need to add "distinct on": + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.status_code = 503 + AND cdx.sha1hex IS NOT NULL + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json' + WITH NULL ''; + # COPY 4297 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid503.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +## Never Processed CDX + +PDFs in fatcat which have never been processed with GROBID. + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN grobid ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json' + WITH NULL ''; + # COPY 15488 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.cdx.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + +PDFs in fatcat which have never been processed with pdfextract. 
+ + # TODO + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM fatcat_file + LEFT JOIN cdx ON fatcat_file.sha1hex = cdx.sha1hex + LEFT JOIN pdf_meta ON pdf_meta.sha1hex = fatcat_file.sha1hex + LEFT JOIN file_meta ON file_meta.sha1hex = fatcat_file.sha1hex + WHERE + pdf_meta.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.mimetype = 'application/pdf' + AND (file_meta.mimetype = 'application/pdf' OR file_meta.mimetype IS NULL) + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json' + WITH NULL ''; + # COPY 45535 + + cat /srv/sandcrawler/tasks/unextracted_fatcat.2021-12-08.cdx.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + # 45.5k 0:00:01 [30.2k/s] + +## Timeout or Failure + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json' + WITH NULL ''; + # COPY 8,084,296 + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-06.grobid_failed.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +This seems to not be working very well, mostly errors, empty docs, etc. Will +roll-forward the kafka consumer group after attempting a couple hundred +thousand of these. 
+Let's try limiting to files actually in fatcat: + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN file_meta ON grobid.sha1hex = file_meta.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + (grobid.status_code = 500 OR grobid.status_code = -4) + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + AND file_meta.mimetype = 'application/pdf' + -- sort of arbitrary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json' + WITH NULL ''; + # COPY 529265 + +That is a much more manageable batch to retry. + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-08.grobid_failed.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 529k 0:00:17 [31.0k/s] + + +## Missing Fatcat Files + +There were around a half million fatcat file entities which didn't have `cdx` +rows in sandcrawler. Did some specific pdfextract processing; now we should do +GROBID ingest as well. + +Enqueue the `CDX` objects for GROBID and pdfextract processing: + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 354k 0:00:11 [30.6k/s] + + zcat /schnell/fatcat_cleanups/file_meta/files_missing_sha256.cdx_rows.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +And some earlier files of interest on `aitio`: + + cat files_missing_sha256.ingest_results.json \ + | rg '"application/pdf"' \ + | rg -v "\\\\" \ + | jq .cdx -c \ + | sort -u -S 4G \ + | pv -l \ + > files_missing_sha256.cdx.uniq.json + # 100k 0:00:47 [2.09k/s] + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + cat files_missing_sha256.cdx.uniq.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + + +## Ancient Fatcat Files + +Files from an era where we didn't record GROBID version or status, even for +success. + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status_code = 200 + AND grobid.status IS NULL + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' + WITH NULL ''; + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 107k 0:00:03 [29.9k/s] + + +## Start Re-Processing Old GROBID Versions + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status = 'success' + AND grobid.grobid_version NOT LIKE '0.7.%' + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitrary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' + WITH NULL ''; + +This one is huge, and want to process in batches/chunks of ~5 million at a time. + + cd /srv/sandcrawler/tasks/ + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \ + | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json + +Submit individual batches like: + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Overall progress: + + x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_01.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_02.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_03.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_04.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_05.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_06.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_07.json + x ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small) + +This finally finished on 2022-04-26. Hooray! + +## General Counts + +How many fatcat files of what mimetype (reported in sandcrawler-db)? 
+ + SELECT file_meta.mimetype, COUNT(*) + FROM fatcat_file + LEFT JOIN file_meta ON fatcat_file.sha1hex = file_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY file_meta.mimetype + ORDER BY COUNT(*) DESC + LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------+---------- + application/pdf | 45227033 + | 433068 + application/octet-stream | 30634 + application/jats+xml | 6874 + text/html | 876 + application/postscript | 199 + application/gzip | 173 + text/plain | 84 + application/xml | 48 + application/vnd.ms-powerpoint | 38 + application/msword | 16 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 8 + image/jpeg | 6 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 4 + message/rfc822 | 4 + application/zip | 4 + text/x-tex | 3 + application/x-dosexec | 3 + application/x-tar | 2 + application/vnd.ms-tnef | 2 + image/svg+xml | 1 + image/tiff | 1 + image/png | 1 + image/gif | 1 + application/vnd.ms-office | 1 + (25 rows) + + +PDF extract status? + + SELECT pdf_meta.status, COUNT(*) + FROM fatcat_file + LEFT JOIN pdf_meta ON fatcat_file.sha1hex = pdf_meta.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY pdf_meta.status + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | count + ----------------+---------- + success | 43415920 + | 2018522 + text-too-large | 122730 + parse-error | 94876 + not-pdf | 32156 + error-wayback | 14504 + bad-unicode | 279 + bad-pdf | 98 + empty-blob | 2 + (9 rows) + + +What are the GROBID status codes for fatcat files? 
Narrowed down: + + SELECT grobid.status, grobid.status_code, COUNT(*) + FROM fatcat_file + LEFT JOIN grobid ON fatcat_file.sha1hex = grobid.sha1hex + WHERE + fatcat_file.first_release_ident IS NOT NULL + AND fatcat_file.any_url = true + AND content_scope IS NULL + GROUP BY grobid.status, grobid.status_code + ORDER BY COUNT(*) DESC + LIMIT 25; + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 44409069 + error | 500 | 580402 + | | 468836 + | 200 | 240660 + error-timeout | -4 | 79 + bad-grobid-xml | 200 | 38 + error | 200 | 3 + (7 rows) + +Ran the same query again on 2021-12-15: + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 45092915 + error | 500 | 302373 + | | 250335 + | 200 | 53352 + bad-grobid-xml | 200 | 39 + error-timeout | -4 | 37 + error | 200 | 34 + error | 503 | 2 + (8 rows) diff --git a/notes/tasks/2022-01-07_grobid_platform_pdfs.md b/notes/tasks/2022-01-07_grobid_platform_pdfs.md new file mode 100644 index 0000000..b5422c2 --- /dev/null +++ b/notes/tasks/2022-01-07_grobid_platform_pdfs.md @@ -0,0 +1,23 @@ + +Martin crawled more than 10 million new PDFs from various platform domains. We +should get these processed and included in sandcrawler-db. + +## Select CDX Rows + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + LEFT JOIN grobid ON grobid.sha1hex = cdx.sha1hex + WHERE + grobid.sha1hex IS NULL + AND cdx.sha1hex IS NOT NULL + AND cdx.warc_path LIKE 'PLATFORM-CRAWL-2020%' + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json' + WITH NULL ''; + => COPY 8801527 + + cat /srv/sandcrawler/tasks/ungrobided_platform_crawl.2022-01-07.cdx.json | rg -v "\\\\" | jq . 
-c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + + # for pdfextract, would be: sandcrawler-prod.unextracted diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md new file mode 100644 index 0000000..c727a57 --- /dev/null +++ b/notes/tasks/2022-03-07_ukraine_firedrill.md @@ -0,0 +1,225 @@ + +Want to do priority crawling of Ukrainian web content, plus Russia and Belarus. + + +## What is Missing? + + (country_code:ua OR lang:uk) + => 2022-03-08, before ingests: 470,986 total, 170,987 missing, almost all article-journal, peak in 2019, 55k explicitly OA + later in day, already some 22k missing found! wow + => 2022-04-04, after ingests: 476,174 total, 131,063 missing, 49k OA missing + +## Metadata Prep + +- container metadata update (no code changes) + x wikidata SPARQL update + x chocula run + x journal metadata update (fatcat) + x update journal stats (fatcat extra) +- DOAJ article metadata import + x prep and upload single JSON file + + +## Journal Homepage URL Crawl + +x dump ukraine-related journal homepages from chocula DB +x create crawl config +x start crawl +x repeat for belarus and russia + + + python3 -m chocula export_urls > homepage_urls.2022-03-08.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ua/' | sort -u > homepage_urls.2022-03-08.ua_tld.tsv + wc -l homepage_urls.2022-03-08.ua_tld.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.by/' | sort -u > homepage_urls.2022-03-08.by_tld.tsv + cat homepage_urls.2022-03-08.tsv | cut -f2 | rg '\.ru/' | sort -u > homepage_urls.2022-03-08.ru_tld.tsv + +sqlite3: + + select count(*) from journal where country = 'ua' or lang = 'uk' or name like '%ukrain%' or publisher like '%ukrain%'; + 1952 + + SELECT COUNT(*) FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR 
journal.publisher like '%ukrain%'; + => 1970 + + .mode csv + .once homepage_urls_ukraine.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ua' + OR journal.lang = 'uk' + OR journal.name like '%ukrain%' + OR journal.publisher like '%ukrain%'; + + .mode csv + .once homepage_urls_russia.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'ru' + OR journal.lang = 'ru' + OR journal.name like '%russ%' + OR journal.publisher like '%russ%'; + + .mode csv + .once homepage_urls_belarus.tsv + SELECT homepage.url FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + journal.country = 'by' + OR journal.lang = 'be' + OR journal.name like '%belarus%' + OR journal.publisher like '%belarus%'; + + cat homepage_urls_ukraine.tsv homepage_urls.2022-03-08.ua_tld.tsv | sort -u > homepage_urls_ukraine_combined.2022-03-08.tsv + + wc -l homepage_urls.2022-03-08.ua_tld.tsv homepage_urls_ukraine.tsv homepage_urls_ukraine_combined.2022-03-08.tsv + 1550 homepage_urls.2022-03-08.ua_tld.tsv + 1971 homepage_urls_ukraine.tsv + 3482 homepage_urls_ukraine_combined.2022-03-08.tsv + + cat homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv | sort -u > homepage_urls_russia_combined.2022-03-08.tsv + + wc -l homepage_urls_russia.tsv homepage_urls.2022-03-08.ru_tld.tsv homepage_urls_russia_combined.2022-03-08.tsv + 3728 homepage_urls_russia.tsv + 2420 homepage_urls.2022-03-08.ru_tld.tsv + 6030 homepage_urls_russia_combined.2022-03-08.tsv + + + cat homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv | sort -u > homepage_urls_belarus_combined.2022-03-08.tsv + + wc -l homepage_urls_belarus.tsv homepage_urls.2022-03-08.by_tld.tsv homepage_urls_belarus_combined.2022-03-08.tsv + 138 homepage_urls_belarus.tsv + 85 homepage_urls.2022-03-08.by_tld.tsv + 222 homepage_urls_belarus_combined.2022-03-08.tsv + + +## Landing Page Crawl + 
+x create crawl config +x fatcat ingest query for related URLs + => special request code/label? +x finish .by and .ru article URL dump, start crawling +x URL list filtered from new OAI-PMH feed + => do we need to do full bulk load/dump, or not? +- URL list from partner (google) +- do we need to do alternative thing of iterating over containers, ingesting each? + + ./fatcat_ingest.py --env prod \ + --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-bulk \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" + + # around Tue 08 Mar 2022 01:07:37 PM PST + # Expecting 185659 release objects in search queries + # didn't complete successfully? hrm + + # ok, retry "manually" (with kafkacat) + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ua OR lang:uk" \ + | pv -l \ + | gzip \ + > /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json + # Counter({'elasticsearch_release': 172881, 'estimate': 172881, 'ingest_request': 103318}) + # 103k 0:25:04 [68.7 /s] + + zcat /srv/fatcat/ingest_ua_pdfs.2022-03-08.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ua_pdfs.2022-03-08.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ua_pdfs.2022-03-08.txt.gz + # 103k 0:00:02 [38.1k/s] + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:by OR lang:be" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz + # Expecting 2266 release objects in search queries + # 1.29k 0:00:34 [37.5 /s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_by_pdfs.2022-03-09.requests.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_by_pdfs.2022-03-09.txt.gz + + ./fatcat_ingest.py --env prod \ + --ingest-type pdf \ + --allow-non-oa \ + query "country_code:ru OR lang:ru" \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.json.gz + # Expecting 1515246 release objects in search queries + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + zcat ingest_ru_pdfs.2022-03-09.requests.partial.json.gz | jq .base_url -r | sort -u | pv -l | gzip > ingest_ru_pdfs.2022-03-09.txt.gz + + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ua/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ua_tld.txt + # 309k 0:00:03 [81.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.by/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.by_tld.txt + # 71.2k 0:00:03 [19.0k/s] + + zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt + # 276k 0:00:03 [72.9k/s] + + +### Landing Page Bulk Ingest + +Running these 2022-03-24, after targeted crawl completed: + + zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 103k 0:00:02 [36.1k/s] + + zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \ + | rg -v "\\\\" \ + | jq . 
-c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 1.29k 0:00:00 [15.8k/s] + + zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 546k 0:00:13 [40.6k/s] + +It will probably take a week or more for these to complete. + + +## Outreach + +- openalex +- sucho.org +- ceeol.com diff --git a/notes/tasks/2022-04-27_pdf_url_lists.md b/notes/tasks/2022-04-27_pdf_url_lists.md new file mode 100644 index 0000000..273ff32 --- /dev/null +++ b/notes/tasks/2022-04-27_pdf_url_lists.md @@ -0,0 +1,72 @@ + +Another dump of PDF URLs for partners. This time we want to provide a TSV with full +wayback download URLs, as well as "access" URLs. + + export TASKDATE=2022-04-27 + +## "Ingested", AKA, "Targeted" PDF URLs + +These are URLs where we did a successful ingest run. + + COPY ( + SELECT + terminal_sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || terminal_dt || 'id_/' || terminal_url) as crawl_url, + ('https://web.archive.org/web/' || terminal_dt || '/' || terminal_url) as display_url + FROM ingest_file_result + WHERE + ingest_type = 'pdf' + AND status = 'success' + AND hit = true + ORDER BY terminal_sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_ingested.2022-04-27.tsv' + WITH NULL ''; + => COPY 85712674 + +May contain duplicates by sha1hex, by URL, or by both. + +Note that this could be filtered by timestamp, to make it monthly/annual. 
+ + +## All CDX PDFs + +"All web PDFs": CDX query; left join file_meta, but don't require a file_meta row + + COPY ( + SELECT + cdx.sha1hex as pdf_sha1hex, + ('https://web.archive.org/web/' || cdx.datetime || 'id_/' || cdx.url) as crawl_url, + ('https://web.archive.org/web/' || cdx.datetime || '/' || cdx.url) as display_url + FROM cdx + LEFT JOIN file_meta + ON + cdx.sha1hex = file_meta.sha1hex + WHERE + file_meta.mimetype = 'application/pdf' + OR ( + file_meta.mimetype IS NULL + AND cdx.mimetype = 'application/pdf' + ) + ORDER BY cdx.sha1hex ASC + -- LIMIT 10; + ) + TO '/srv/sandcrawler/tasks/ia_wayback_pdf_speculative.2022-04-27.tsv' + WITH NULL ''; + => COPY 161504070 + +Should be unique by wayback URL; may contain near-duplicates or duplicates by sha1hex + +## Upload to archive.org + +TODO: next time compress these files first (gzip/pigz) + +ia upload ia_scholarly_urls_$TASKDATE \ + -m collection:ia_biblio_metadata \ + -m title:"IA Scholarly URLs ($TASKDATE)" \ + -m date:$TASKDATE \ + -m creator:"Internet Archive Web Group" \ + -m description:"URL lists to PDFs on the web (and preserved in the wayback machine) which are likely to contain research materials." \ + /srv/sandcrawler/tasks/ia_wayback_pdf_ingested.$TASKDATE.tsv /srv/sandcrawler/tasks/ia_wayback_pdf_speculative.$TASKDATE.tsv + diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. + +Needed to clean up! 
Basically sorted through a few editgroups manually, then +rejected all the rest and manually re-submitted with the below queries and +commands: + + SELECT COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'; + => 589 + + SELECT ingest_file_result.status, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY ingest_file_result.status; + + status | count + ----------------+------- + cdx-error | 1 + success | 587 + wrong-mimetype | 1 + (3 rows) + + + SELECT grobid.status_code, COUNT(*) from ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + GROUP BY grobid.status_code; + + status_code | count + -------------+------- + 200 | 385 + 500 | 202 + | 2 + (3 rows) + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON 
+ grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 500 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json'; + => COPY 202 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid ON + grobid.sha1hex = ingest_file_result.terminal_sha1hex + WHERE + ingest_request.link_source = 'spn' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url like 'https://andrzejklimczuk.com/%' + AND ingest_file_result.status = 'success' + AND grobid.status_code = 200 + ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json'; + => COPY 385 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + | jq '. + {force_recrawl: true}' -c \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n100 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \ + | shuf \ + | head -n10000 \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \ + > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json + +cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 |