Diffstat (limited to 'notes')
-rw-r--r-- | notes/dryad_datasets.md | 17
-rw-r--r-- | notes/examples/2021-11-12_broken_grobid_xml.md | 83
-rw-r--r-- | notes/examples/dataset_examples.txt | 52
-rw-r--r-- | notes/examples/html_test_journals.txt | 153
-rw-r--r-- | notes/examples/random_datasets.md | 19
-rw-r--r-- | notes/ingest/2021-09-02_oai_pmh_patch.md | 4
-rw-r--r-- | notes/ingest/2022-03_oaipmh.md | 40
-rw-r--r-- | notes/ingest/2022-07-19_dblp.md | 50
-rw-r--r-- | notes/ingest/2022-07_doaj.md | 199
-rw-r--r-- | notes/ingest/2022-07_targeted.md | 140
-rw-r--r-- | notes/ingest/2022-09_oaipmh.md | 397
-rw-r--r-- | notes/ingest_domains.txt | 294
-rw-r--r-- | notes/possible_ingest_targets.txt | 15
-rw-r--r-- | notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md | 132
14 files changed, 1593 insertions, 2 deletions
diff --git a/notes/dryad_datasets.md b/notes/dryad_datasets.md new file mode 100644 index 0000000..5c727b1 --- /dev/null +++ b/notes/dryad_datasets.md @@ -0,0 +1,17 @@ + +api docs: https://datadryad.org/api/v2/docs + +current search queries return 38,000 hits (December 2020) + +example with multiple versions: + https://datadryad.org/stash/dataset/doi:10.5061/dryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0 + https://datadryad.org/api/v2/datasets/doi%3A10.5061%2Fdryad.fbg79cnr0/versions + + +how to handle versions? DOI doesn't get incremented. + +on archive.org, could have separate item for each version, or sub-directories within item, one for each version + +in fatcat, could have a release for each version, but only one with +the DOI; or could have a separate fileset for each version diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md new file mode 100644 index 0000000..5223651 --- /dev/null +++ b/notes/examples/2021-11-12_broken_grobid_xml.md @@ -0,0 +1,83 @@ + +Find all the PDFs from the web which resulted in `bad-grobid-xml` status (among others): + + sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100; + + sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata + ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------ + d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"} + 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"} + 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf + FIXED + f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"} + https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf + FIXED + c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"} + https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf + FIXED (and good) + 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"} + https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf + FIXED + metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"} + https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23 + FIXED + 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"} + https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf + FIXED + 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"} + not found + acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"} + https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf + BROKEN: not well-formed (invalid token): line 60, column 45 + <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note> + 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"} + not found + c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"} + https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308 + BROKEN: not well-formed (invalid token): line 58, column 45 + <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, & Bian, 2020).</note> + 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"} + not found + 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"} + not found + f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"} + https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649 + FIXED, good + f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf + FIXED + 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"} + https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf + FIXED + 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"} + https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1 + FIXED + (21 rows) + +Some other errors from other queries: + + d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | 
{"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"} + https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf + FIXED: with 0.7.0+ + + 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"} + https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf + still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500 + BAD PDF ("no pages" in evince) + + d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"} + https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf + FIXED + + 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t + https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf + FIXED + +In summary, there are still a small number of `bad-grobid-xml` cases, and still +many "very large PDF" cases. But we should probably broadly retry everything, +especially the 503 errors (from when GROBID is simply down/unavailable). + +The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations, +which I have submitted a patch/PR for. diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt new file mode 100644 index 0000000..3a04750 --- /dev/null +++ b/notes/examples/dataset_examples.txt @@ -0,0 +1,52 @@ + +### ArchiveOrg: CAT dataset + +<https://archive.org/details/CAT_DATASET> + +`release_36vy7s5gtba67fmyxlmijpsaui` + +### + +<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635> + +doi:10.1371/journal.pone.0120448 + +Single .rar file + +### Dataverse + +<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B> + +Single excel file + +### Dataverse + +<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + +doi:10.7910/DVN/CLSFKX + +Mulitple files; multiple versions? + +API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1> + + .data.id + .data.latestVersion.datasetPersistentId + .data.latestVersion.versionNumber, .versionMinorNumber + .data.latestVersion.files[] + .dataFile + .contentType (mimetype) + .filename + .filesize (int, bytes) + .md5 + .persistendId + .description + .label (filename?) 
+ .version + +Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> + +Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3) + +Dataverse refs: +- 'doi' and 'hdl' are the two persistentId styles +- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt new file mode 100644 index 0000000..540dc9f --- /dev/null +++ b/notes/examples/html_test_journals.txt @@ -0,0 +1,153 @@ + +Good examples of journals to run HTML fulltext extraction on. + +## Live Web + +d-lib magazine + live web + no longer active + http://www.dlib.org/back.html + +NLM technical bulletin + https://www.nlm.nih.gov/pubs/techbull/back_issues.html + +Genders + https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html + +firstmondays + live web; now OJS + +outhistory.org + +http://journal.sjdm.org/ + +http://whoosh.org/ + + +## Vanished (but wayback coverage) + +ohmylittledata + issn:2551-1289 + vanished + blog format + http://web.archive.org/web/20180421061156/https://ohmylittledata.com/ + +exquisit corpse + https://web.archive.org/web/20080521052400/http://corpse.org:80/ + +Journal of Mundane Behavior + https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya + ISSN: 1529-3041 + + defunct since ~2010 + simple HTML articles + references + http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm + http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm + +War Crimes + + PDF articles (not HTML) + http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/ + + +## DOAJ Test Articles (HTML) + + zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt + => 2,184,954 + + cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25 + 254817 link.springer.com + 145159 www.scielo.br + 78044 journal.frontiersin.org + 77394 www.frontiersin.org + 40849 www.dovepress.com + 19024 dergipark.org.tr + 18758 periodicos.ufsc.br + 16346 www.revistas.usp.br + 15872 revistas.unal.edu.co + 15527 revistas.ucm.es + 13669 revistas.usal.es + 12640 dergipark.gov.tr + 12111 journals.rudn.ru + 11839 www.scielosp.org + 11277 www.karger.com + 10827 www.journals.vu.lt + 10318 + 9854 peerj.com + 9100 ojs.unud.ac.id + 8581 jurnal.ugm.ac.id + 8261 riviste.unimi.it + 8012 journals.uran.ua + 7454 revistas.pucp.edu.pe + 7264 journals.vgtu.lt + 7200 publicaciones.banrepcultural.org + + cat html_fulltext_urls.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.filtered.txt + => 1,579,257 + + zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt + => 560k + + cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25 + 40849 www.dovepress.com + 10570 journals.rudn.ru + 10494 dergipark.org.tr + 10233 revistas.unal.edu.co + 9981 dergipark.gov.tr + 9428 revistas.usal.es + 8292 revistas.ucm.es + 7200 publicaciones.banrepcultural.org + 6953 revistas.pucp.edu.pe + 6000 www.scielosp.org + 5962 
www.scielo.br + 5621 www.richtmann.org + 5123 scielo.sld.cu + 5067 ojs.unud.ac.id + 4838 periodicos.ufsc.br + 4736 revistasonlinepre.inap.es + 4486 journal.fi + 4221 www.seer.ufu.br + 3553 revistas.uam.es + 3492 revistas.pucsp.br + 3060 www.scielo.org.co + 2991 scielo.isciii.es + 2802 seer.ufrgs.br + 2692 revistas.unc.edu.ar + 2685 srl.si + + cat html_fulltext_urls.no_doi.txt \ + | rg -v link.springer.com \ + | rg -v scielo \ + | rg -v dergipark.gov.tr \ + | rg -v frontiersin.org \ + > html_fulltext_urls.no_doi.filtered.txt + => 518,608 + + zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20 + https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795 + https://journal.umy.ac.id/index.php/st/article/view/3297 + https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442 + http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf + http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440 + https://journal.fi/inf/article/view/59430 + http://journal.uii.ac.id/index.php/Eksakta/article/view/2429 + https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS + https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157 + http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce + http://revistas.pucp.edu.pe/index.php/themis/article/view/11862 + http://journal.bdfish.org/index.php/fisheries/article/view/91 + https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567 + https://www.lithosphere.ru/jour/article/view/779 + https://journals.hioa.no/index.php/seminar/article/view/2412 + http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197 + https://www.kmuj.kmu.edu.pk/article/view/15698 + http://forodeeducacion.com/ojs/index.php/fde/article/view/82 + https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941 + http://grbs.library.duke.edu/article/view/3361 + diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md new file mode 100644 index 0000000..b69132c --- /dev/null +++ b/notes/examples/random_datasets.md @@ -0,0 +1,19 @@ + +Possible external datasets to ingest (which are not entire platforms): + +- https://research.google/tools/datasets/ +- https://openslr.org/index.html +- https://www.kaggle.com/datasets?sort=votes&tasks=true +- https://archive.ics.uci.edu/ml/datasets.php + +Existing archive.org datasets to ingest: + +- https://archive.org/details/allthemusicllc-datasets + +Papers on archive.org to ingest: + +- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=> +- <https://archive.org/details/biorxiv> +- <https://archive.org/details/philosophicaltransactions?tab=collection> +- <https://archive.org/search.php?query=doi%3A%2A> +- <https://archive.org/details/folkscanomy_academic> diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md index fded7b3..ac808dd 100644 --- a/notes/ingest/2021-09-02_oai_pmh_patch.md +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -1506,8 +1506,8 @@ possible to detect these at ingest time, or earlier at OAI-PMH harvest/transform time and filter them out. It may be worthwhile to attempt ingest of multiple existing captures -(timestamps) in the ingest pipeline. 
Eg, isntead of chosing a single "best" -capture, if therea are multiple HTTP 200 status captures, try ingest with each +(timestamps) in the ingest pipeline. Eg, instead of chosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each (or at least a couple). This is because repository software gets upgraded, so old "no-capture" or "not found" or "link loop" type captures may work when recrawled. diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md new file mode 100644 index 0000000..d2a8d71 --- /dev/null +++ b/notes/ingest/2022-03_oaipmh.md @@ -0,0 +1,40 @@ + +Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. + +Note that Martin excluded many Indonesian endpoints, will need to follow-up on +those. + +## Prep + +Fetch metadata snapshot: + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst + +Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): + + zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ + | rg -v 'oai:kb.dk:' \ + | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ + | rg -v 'oai:hispana.mcu.es:' \ + | rg -v 'oai:bnf.fr:' \ + | rg -v 'oai:ukm.si:' \ + | rg -v 'oai:biodiversitylibrary.org:' \ + | rg -v 'oai:hsp.org:' \ + | rg -v 'oai:repec:' \ + | rg -v 'oai:n/a:' \ + | rg -v 'oai:quod.lib.umich.edu:' \ + | rg -v 'oai:americanae.aecid.es:' \ + | rg -v 'oai:www.irgrid.ac.cn:' \ + | rg -v 'oai:espace.library.uq.edu:' \ + | rg -v 'oai:edoc.mpg.de:' \ + | rg -v 'oai:bibliotecadigital.jcyl.es:' \ + | rg -v 'oai:repository.erciyes.edu.tr:' \ + | rg -v 'oai:krm.or.kr:' \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz + +These failed to transform in the expected way; a change in JSON schema from last time? diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md new file mode 100644 index 0000000..74aeb8d --- /dev/null +++ b/notes/ingest/2022-07-19_dblp.md @@ -0,0 +1,50 @@ + +Cross-posting from fatcat bulk metadata update/ingest. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +## Post-Crawl Stats + +This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run +2022-09-06: + + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'dblp' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-----------------------+-------- + pdf | success | 305142 + pdf | no-pdf-link | 192683 + pdf | no-capture | 42634 + pdf | terminal-bad-status | 38041 + pdf | skip-url-blocklist | 31055 + pdf | link-loop | 9263 + pdf | wrong-mimetype | 4545 + pdf | redirect-loop | 3952 + pdf | empty-blob | 2705 + pdf | wayback-content-error | 834 + pdf | wayback-error | 294 + pdf | petabox-error | 202 + pdf | blocked-cookie | 155 + pdf | cdx-error | 115 + pdf | body-too-large | 66 + pdf | bad-redirect | 19 + pdf | timeout | 7 + pdf | bad-gzip-encoding | 4 + (18 rows) + +That is quite a lot of `no-pdf-link`, might be worth doing a random sample +and/or re-ingest. And a chunk of `no-capture` to retry. diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md new file mode 100644 index 0000000..7e55633 --- /dev/null +++ b/notes/ingest/2022-07_doaj.md @@ -0,0 +1,199 @@ + +This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for +heritrix bulk crawling, along with JALC and DOAJ URLs. + + export SNAPSHOT=2022-07-20 + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz" + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz + # 9.72M 0:36:28 [4.44k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # 9.72M 0:17:04 [9.49k/s] + # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097}) + +Stats after this load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3165539 + pdf | | 2078874 + html | | 1547698 + html | wrong-scope | 1114332 + pdf | no-pdf-link | 517261 + html | success | 388376 + html | unknown-scope | 242044 + pdf | no-capture | 179030 + pdf | terminal-bad-status | 174741 + html | no-capture | 155323 + pdf | null-body | 129267 + pdf | redirect-loop | 127136 + html | html-resource-no-capture | 117275 + html | null-body | 100296 + pdf | blocked-cookie | 71093 + html | redirect-loop | 65519 + html | terminal-bad-status | 64856 + html | blocked-cookie | 64095 + html | spn2-backoff | 55173 + pdf | link-loop | 27440 + html | wrong-mimetype | 26016 + html | 
wayback-content-error | 20109 + xml | | 13624 + pdf | wrong-mimetype | 8411 + xml | success | 6899 + html | petabox-error | 6199 + html | wayback-error | 5269 + html | spn2-cdx-lookup-failure | 4635 + html | spn2-recent-capture | 4527 + xml | null-body | 2353 + (30 rows) + +## Bulk Ingest + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json'; + # COPY 3962331 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json + # 3.96M 0:01:47 [36.7k/s] + +Top domains: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 789988 www.mdpi.com + 318142 www.frontiersin.org + 226316 link.springer.com + 204429 www.scielo.br + 201175 www.sciencedirect.com + 72852 ieeexplore.ieee.org + 68983 dx.doi.org + 33286 www.dovepress.com + 26020 elifesciences.org + 23838 www.cetjournal.it + 21102 mab-online.nl + 20242 www.revistas.usp.br + 16564 periodicos.uem.br + 15710 journals.openedition.org + 14514 dergipark.org.tr + 14072 apcz.umk.pl + 13924 ojs.minions.amsterdam + 13717 bmgn-lchr.nl + 13512 ojstest.minions.amsterdam + 10440 journals.asm.org + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # Done + +## Stats Again + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 4704006 + html | wrong-scope | 1761227 + html | success | 778165 + pdf | no-pdf-link | 759805 + html | no-capture | 382080 + html | unknown-scope | 313391 + html | html-resource-no-capture | 292953 + pdf | no-capture | 290311 + pdf | terminal-bad-status | 271776 + pdf | null-body | 129267 + pdf | blocked-cookie | 108491 + html | terminal-bad-status | 103014 + html | null-body | 100296 + html | blocked-cookie | 88533 + pdf | | 81517 + pdf | skip-url-blocklist | 76443 + html | spn2-backoff | 50615 + pdf | link-loop | 45516 + html | wrong-mimetype | 33525 + html | wayback-content-error | 25535 + pdf | empty-blob | 21431 + pdf | redirect-loop | 19795 + html | petabox-error | 18291 + html | empty-blob | 14391 + pdf | wrong-mimetype | 14084 + html | redirect-loop | 12856 + xml | success | 10381 + xml | no-capture | 10008 + html | skip-url-blocklist | 3294 + html | cdx-error | 3275 + (30 rows) + +Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k +PDFs with no attempt at all? Maybe a filter, or bogus URLs. + +Over 1.5M new PDF success over this crawl iteration period, nice. diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md new file mode 100644 index 0000000..415f23b --- /dev/null +++ b/notes/ingest/2022-07_targeted.md @@ -0,0 +1,140 @@ + +Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs. 
+ + export PATCHDATE=2022-07-29 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json'; + => COPY 3524573 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v '://10\.' \ + | rg -v '://172\.' 
\ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + => 3.11M 0:01:08 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 624948 doi.org + 382492 www.jstage.jst.go.jp + 275087 www.mdpi.com + 157134 www.persee.fr + 108979 www.sciencedirect.com + 94375 www.scielo.br + 50834 onlinelibrary.wiley.com + 49991 journals.lww.com + 30354 www.frontiersin.org + 27963 doaj.org + 27058 www.e-periodica.ch + 24147 dl.acm.org + 23389 aclanthology.org + 22086 www.research-collection.ethz.ch + 21589 medien.die-bonn.de + 18866 www.ingentaconnect.com + 18583 doi.nrct.go.th + 18271 repositories.lib.utexas.edu + 17634 hdl.handle.net + 16366 archives.datapages.com + 15146 cgscholar.com + 13987 dl.gi.de + 13188 www.degruyter.com + 12503 ethos.bl.uk + 12304 preprints.jmir.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + => done + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + + +## Re-Ingest + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json + => 3.52M 0:01:37 [36.2k/s] + +Ingest: + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md new file mode 100644 index 0000000..ac7c68f --- /dev/null +++ b/notes/ingest/2022-09_oaipmh.md @@ -0,0 +1,397 @@ + +Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921> + +I updated the transform script to block some additional domains. 
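+
+(A minimal sketch of the kind of domain filter involved, with a hypothetical
+blocklist; this is not the actual `oai2ingestrequest.py` internals:)
+
+    from urllib.parse import urlparse
+
+    # hypothetical blocklist; the real script's list is longer and different
+    DOMAIN_BLOCKLIST = ["kb.dk", "hispana.mcu.es", "bdr.oai.bsb-muenchen.de"]
+
+    def url_is_blocked(url: str) -> bool:
+        # match the hostname exactly, or as the parent domain of a subdomain
+        host = (urlparse(url).hostname or "").lower()
+        return any(host == d or host.endswith("." + d) for d in DOMAIN_BLOCKLIST)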
+ + +## Prep + +Fetch the snapshot: + + cd /srv/sandcrawler/tasks/ + wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst + +Transform to ingest requests: + + cd /srv/sandcrawler/src/python + git log | head -n1 + # commit dfd4605d84712eccb95a63e50b0bcb343642b433 + + pipenv shell + zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz + # 16.1M 1:01:02 [4.38k/s] + +Curious about types, though this would probably be handled at fatcat ingest +time: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt + + head oai_type_counts.txt -n30 + 5623867 info:eu-repo/semantics/article + 5334928 info:eu-repo/semantics/publishedVersion + 3870359 text + 1240225 Text + 829169 Article + 769849 NonPeerReviewed + 665700 PeerReviewed + 648740 Peer-reviewed Article + 547857 article + 482906 info:eu-repo/semantics/bachelorThesis + 353814 Thesis + 329269 Student thesis + 262650 info:eu-repo/semantics/conferenceObject + 185354 Journal articles + 162021 info:eu-repo/semantics/doctoralThesis + 152079 Journal Article + 150226 Research Article + 130217 Conference papers + 127255 ArtÃculo revisado por pares + 124243 Newspaper + 123908 ##rt.metadata.pkp.peerReviewed## + 123309 Photograph + 122981 info:eu-repo/semantics/masterThesis + 116719 Book + 108946 Image + 108216 Report + 107946 Other + 103562 masterThesis + 103038 info:eu-repo/semantics/other + 101404 StillImage + [...] + +And formats: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt + + head -n 20 oai_format_counts.txt + 11151928 application/pdf + 677413 text + 561656 text/html + 498518 image/jpeg + 231219 Text + 193638 text/xml + 147214 Image + 117073 image/jpg + 110872 pdf + 91323 image/tiff + 76948 bib + 75393 application/xml + 70244 Digitized from 35 mm. microfilm. + 68206 mods + 59227 PDF + 57677 application/epub+zip + 57602 application/octet-stream + 52072 text/plain + 51620 application/msword + 47227 audio/mpeg + +Also, just overall size (number of records): + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l + # 20,840,301 + +Next load in to sandcrawler DB: + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request - + + Traceback (most recent call last): + File "./persist_tool.py", line 311, in <module> + main() + File "./persist_tool.py", line 307, in main + args.func(args) + File "./persist_tool.py", line 119, in run_ingest_request + pusher.run() + File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run + self.worker.push_batch(batch) + File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch + resp = self.db.insert_ingest_request(self.cur, irequests) + File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request + resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) + File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values + cur.execute(b''.join(parts)) + psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx" + DETAIL: Index row references tuple (6893121,3) in relation "ingest_request". 
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed. + Consider a function index of an MD5 hash of the value, or use full text indexing. + 15.7M 0:41:48 [6.27k/s] + +Darn, this means we won't get reasonable stats about how many rows were +inserted/updated. + +Patched the persist tool to skip very long URLs, and ran again (backwards, just +URLs which didn't get inserted already): + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \ + | tac \ + | head -n1000000 \ + | pv -l \ + | ./persist_tool.py ingest-request - + # 1.00M 0:03:04 [5.41k/s] + # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0}) + +Status of just the new lines: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+--------- + | 6398455 + success | 540219 + no-pdf-link | 41316 + link-loop | 23871 + no-capture | 11350 + redirect-loop | 8315 + wrong-mimetype | 2394 + terminal-bad-status | 1540 + null-body | 1038 + cdx-error | 272 + empty-blob | 237 + petabox-error | 213 + wayback-error | 186 + blocked-cookie | 107 + timeout | 47 + wayback-content-error | 26 + spn2-cdx-lookup-failure | 21 + skip-url-blocklist | 16 + spn2-backoff | 15 + body-too-large | 13 + (20 rows) + + +## Bulk Ingest + +Should already have filtered domains/prefixes in transform script, so not +including filters here. + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ingest_file_result.status IS NULL + ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json'; + # COPY 6398455 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json + # 6.40M 0:02:18 [46.2k/s] + + cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # DONE + +Expect this ingest to take a week or so. 
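+
+(For reference, the "skip very long URLs" patch mentioned above boils down to a
+length check before insert; a rough sketch, with the cutoff assumed from the
+btree error rather than copied from the actual patch:)
+
+    from collections import Counter
+
+    MAX_URL_LEN = 2048  # assumed cutoff; btree index rows max out near 2704 bytes
+    counts = Counter()
+
+    def keep_request(req: dict) -> bool:
+        # drop rows whose base_url would overflow ingest_request_base_url_idx,
+        # counting them like the "skip-url-too-long" tally in the output above
+        if len(req.get("base_url", "")) > MAX_URL_LEN:
+            counts["skip-url-too-long"] += 1
+            return False
+        return True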
+ +Then, run stats again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3617175 + success | 2775036 + no-pdf-link | 449298 + link-loop | 74260 + terminal-bad-status | 47819 + wrong-mimetype | 20195 + redirect-loop | 18197 + empty-blob | 12127 + cdx-error | 3038 + skip-url-blocklist | 2630 + wayback-error | 2599 + petabox-error | 2354 + wayback-content-error | 1617 + blocked-cookie | 1293 + null-body | 1038 + body-too-large | 670 + | 143 + bad-gzip-encoding | 64 + timeout | 47 + spn2-cdx-lookup-failure | 20 + (20 rows) + + +## Crawl Seedlist + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'terminal-bad-status' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'timeout' + OR ingest_file_result.status = 'wayback-content-error' + ) + ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json'; + => COPY 3692846 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json + => 3.69M 0:01:19 [46.6k/s] + +This will be used for re-ingest later. For now, extract URLs: + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | jq .base_url -r \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + => 3.66M 0:00:59 [61.8k/s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | rg '"terminal_url"' \ + | jq -r .result.terminal_url \ + | rg -v ^null$ \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + => 0.00 0:00:05 [0.00 /s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | awk '{print "F+ " $1}' \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +What domains are we crawling? 
+ + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | sort -u -S 4G \ + | cut -d/ -f3 \ + | sort \ + | uniq -c \ + | sort -nr \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + + head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + 91899 raco.cat + 70116 islandora.wrlc.org + 68708 urn.kb.se + 63726 citeseerx.ist.psu.edu + 50370 publications.rwth-aachen.de + 44885 urn.nsk.hr + 38429 server15795.contentdm.oclc.org + 33041 periodicos.ufpb.br + 32519 nbn-resolving.org + 31990 www.ajol.info + 24745 hal.archives-ouvertes.fr + 22569 id.nii.ac.jp + 17239 tilburguniversity.on.worldcat.org + 15873 dspace.nbuv.gov.ua + 15436 digitalcommons.wustl.edu + 14885 www.iiste.org + 14623 www.manchester.ac.uk + 14033 nbn-resolving.de + 13999 opus4.kobv.de + 13689 www.redalyc.org + +Sizes: + + wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + +Copy seedlist to crawler: + + # as regular user + scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp + +## Post-Crawl Bulk Ingest + + # ran 2022-11-16, after crawl cleanup + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -----------------------+--------- + success | 4721164 +1,946,128 + no-pdf-link | 1116290 + no-capture | 673939 + terminal-bad-status | 232217 + link-loop | 148544 + wrong-mimetype | 68841 + redirect-loop | 26262 + empty-blob | 17759 + cdx-error | 6570 + blocked-cookie | 4026 + blocked-wall | 3054 + skip-url-blocklist | 2924 + body-too-large | 2404 + bad-redirect | 1565 + wayback-error | 1320 + petabox-error | 1083 + null-body | 1038 + wayback-content-error | 264 + bad-gzip-encoding | 150 + | 143 + (20 rows) + diff --git a/notes/ingest_domains.txt b/notes/ingest_domains.txt new file mode 100644 index 0000000..ae06272 --- /dev/null +++ b/notes/ingest_domains.txt @@ -0,0 +1,294 @@ + +## Queries to find broken domains + +Top domains with failed ingests: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + +Status overview for a particular domain: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT domain, terminal_status_code, 
COUNT((domain, terminal_status_code)) + FROM (SELECT terminal_status_code, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + AND t1.terminal_status_code is not null + GROUP BY domain, terminal_status_code + ORDER BY COUNT DESC; + +Sample recent failures: + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + +## Failing + +www.osapublishing.org + + this publisher (The Optical Society) is systemically using a CAPTCHA to + gate access to PDFs. bummer! could ask them to white-list? + + has citation_pdf_url, so that isn't an issue + + status: "no-pdf-link" + hops: + "https://doi.org/10.1364/optica.6.000798", + "https://www.osapublishing.org/viewmedia.cfm?uri=optica-6-6-798&seq=0" + "https://www.osapublishing.org/captcha/?guid=830CEAB5-09BD-6140-EABD-751200C78B1C" + + domain | status | count + -----------------------+---------------------+------- + www.osapublishing.org | no-capture | 16680 + www.osapublishing.org | no-pdf-link | 373 + www.osapublishing.org | redirect-loop | 19 + www.osapublishing.org | terminal-bad-status | 5 + www.osapublishing.org | cdx-error | 1 + www.osapublishing.org | wrong-mimetype | 1 + www.osapublishing.org | spn-error | 1 + www.osapublishing.org | success | 1 + www.osapublishing.org | wayback-error | 1 + (9 rows) + +www.persee.fr + + Seems to be mostly blocking or rate-limiting? + + domain | status | count + ---------------+-------------------------------------+------- + www.persee.fr | no-capture | 37862 + www.persee.fr | terminal-bad-status | 3134 + www.persee.fr | gateway-timeout | 2828 + www.persee.fr | no-pdf-link | 431 + www.persee.fr | spn-error | 75 + www.persee.fr | redirect-loop | 23 + www.persee.fr | success | 8 + www.persee.fr | spn2-error | 2 + www.persee.fr | spn2-error:soft-time-limit-exceeded | 1 + www.persee.fr | wrong-mimetype | 1 + (10 rows) + +journals.openedition.org + + PDF access is via "freemium" subscription. Get redirects to: + + https://auth.openedition.org/authorized_ip?url=http%3A%2F%2Fjournals.openedition.org%2Fnuevomundo%2Fpdf%2F61053 + + Content is technically open access (HTML and license; for all content?), + but can't be crawled as PDF without subscription. 
+ + domain | status | count + --------------------------+-------------------------+------- + journals.openedition.org | redirect-loop | 29587 + journals.openedition.org | success | 6821 + journals.openedition.org | no-pdf-link | 1507 + journals.openedition.org | no-capture | 412 + journals.openedition.org | wayback-error | 32 + journals.openedition.org | wrong-mimetype | 27 + journals.openedition.org | terminal-bad-status | 13 + journals.openedition.org | spn2-cdx-lookup-failure | 4 + journals.openedition.org | spn-remote-error | 1 + journals.openedition.org | null-body | 1 + journals.openedition.org | cdx-error | 1 + (11 rows) + +journals.lww.com + + no-pdf-link + + domain | status | count + ------------------+----------------+------- + journals.lww.com | no-pdf-link | 11668 + journals.lww.com | wrong-mimetype | 131 + (2 rows) + + doi prefix: 10.1097 + + <meta name="wkhealth_pdf_url" content="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf" /> + data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw==" + + Some weird thing going on, maybe they are blocking-via-redirect based on + our User-Agent? Seems like wget works, so funny that they don't block that. + +musewide.aip.de + + no-pdf-link + +koreascience.or.kr | no-pdf-link | 8867 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'osapublishing.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%osapublishing.org%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + +www.cairn.info | link-loop | 8717 + +easy.dans.knaw.nl | no-pdf-link | 8262 +scielo.conicyt.cl | no-pdf-link | 7925 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'scielo.conicyt.cl' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%scielo.conicyt.cl%' + AND status = 'terminal-bad-status' + ORDER BY updated DESC + LIMIT 10; + + + domain | status | count + -------------------+---------------------+------- + scielo.conicyt.cl | no-pdf-link | 7926 + scielo.conicyt.cl | success | 4972 + scielo.conicyt.cl | terminal-bad-status | 1474 + scielo.conicyt.cl | wrong-mimetype | 6 + scielo.conicyt.cl | no-capture | 4 + scielo.conicyt.cl | null-body | 1 + + + pdf | https://doi.org/10.4067/s0370-41061980000300002 | 2020-02-22 23:55:56.235822+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0370-41061980000300002&lng=en&nrm=iso&tlng=en | 20200212201727 | 200 | + pdf | https://doi.org/10.4067/s0718-221x2019005000201 | 2020-02-22 23:01:49.070104+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-221X2019005000201&lng=en&nrm=iso&tlng=en | 20200214105308 | 200 | + pdf 
| https://doi.org/10.4067/s0717-75262011000200002 | 2020-02-22 22:49:36.429717+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-75262011000200002&lng=en&nrm=iso&tlng=en | 20200211205804 | 200 | + pdf | https://doi.org/10.4067/s0717-95022006000400029 | 2020-02-22 22:33:07.761766+00 | f | terminal-bad-status | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0717-95022006000400029&lng=en&nrm=iso&tlng=en | 20200209044048 | 200 | + + These seem, on retry, like success? Maybe previous was a matter of warc/revisit not getting handled correctly? + + pdf | https://doi.org/10.4067/s0250-71611998007100009 | 2020-02-22 23:57:16.481703+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0250-71611998007100009&lng=en&nrm=iso&tlng=en | 20200212122939 | 200 | + pdf | https://doi.org/10.4067/s0716-27902005020300006 | 2020-02-22 23:56:01.247616+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0716-27902005020300006&lng=en&nrm=iso&tlng=en | 20200214192151 | 200 | + pdf | https://doi.org/10.4067/s0718-23762005000100015 | 2020-02-22 23:53:55.81526+00 | f | no-pdf-link | https://scielo.conicyt.cl/scielo.php?script=sci_arttext&pid=S0718-23762005000100015&lng=en&nrm=iso&tlng=en | 20200214173237 | 200 | + + Look like web/xml only. + + TODO: XML ingest (and replay?) support. These are as "<article>", not sure if that is JATS or what. + +www.kci.go.kr | no-pdf-link | 6842 +www.m-hikari.com | no-pdf-link | 6763 +cshprotocols.cshlp.org | no-pdf-link | 6553 +www.bibliotekevirtual.org | no-pdf-link | 6309 +data.hpc.imperial.ac.uk | no-pdf-link | 6071 +projecteuclid.org | link-loop | 5970 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'projecteuclid.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%projecteuclid.org%' + AND status = 'link-loop' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + -------------------+-------------------------+------- + projecteuclid.org | link-loop | 5985 + projecteuclid.org | success | 26 + projecteuclid.org | wayback-error | 26 + projecteuclid.org | wrong-mimetype | 17 + projecteuclid.org | spn2-cdx-lookup-failure | 4 + projecteuclid.org | other-mimetype | 4 + projecteuclid.org | no-capture | 3 + projecteuclid.org | terminal-bad-status | 2 + projecteuclid.org | spn2-error:job-failed | 1 + projecteuclid.org | spn-remote-error | 1 + (10 rows) + + Doing a cookie check and redirect. + + TODO: brozzler behavior to "click the link" instead? + +www.scielo.br | no-pdf-link | 5823 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.scielo.br' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.scielo.br%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ---------------+-------------------------+------- + www.scielo.br | success | 35150 + www.scielo.br | no-pdf-link | 5839 + www.scielo.br | terminal-bad-status | 429 + www.scielo.br | no-capture | 189 + www.scielo.br | wrong-mimetype | 7 + www.scielo.br | spn2-cdx-lookup-failure | 2 + (6 rows) + + Seems to just be the subset with no PDFs. 
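+
+(Following up the scielo.conicyt.cl XML note above: a quick way to sniff
+whether "<article>" XML is JATS-ish; a heuristic sketch, not a validator:)
+
+    import xml.etree.ElementTree as ET
+
+    def looks_like_jats(xml_bytes: bytes) -> bool:
+        # JATS article XML has an <article> root, usually with a dtd-version
+        # attribute and/or a JATS/NLM DOCTYPE near the top of the document
+        try:
+            root = ET.fromstring(xml_bytes)
+        except ET.ParseError:
+            return False
+        if root.tag.split("}")[-1] != "article":
+            return False
+        head = xml_bytes[:1024].decode("utf-8", "replace")
+        return "dtd-version" in root.attrib or "JATS" in head or "NLM//DTD" in head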
+ +get.iedadata.org | no-pdf-link | 5822 +www.pdcnet.org | no-pdf-link | 5798 +publications.rwth-aachen.de | no-pdf-link | 5323 +www.sciencedomain.org | no-pdf-link | 5231 +medicalforum.ch | terminal-bad-status | 4574 +jrnl.nau.edu.ua | link-loop | 4145 +ojs.academypublisher.com | no-pdf-link | 4017 + +## MAG bulk ingest + +- dialnet.unirioja.es | redirect-loop | 240967 + dialnet.unirioja.es | terminal-bad-status | 20320 + => may be worth re-crawling via heritrix? +- agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + => and other *.onlinelibrary.wiley.com +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 +- papers.ssrn.com | redirect-loop | 27328 + => blocking is pretty aggressive, using cookies or referrer or something. + maybe a brozzler behavior would work, but doesn't currently + +## Out of Scope + +Datasets only? + +- plutof.ut.ee +- www.gbif.org +- doi.pangaea.de +- www.plate-archive.org + +Historical non-paper content: + +- dhz.uni-passau.de (newspapers) +- digital.ucd.ie (irish historical) + +Mostly datasets (some PDF content): + +- *.figshare.com +- zenodo.com +- data.mendeley.com diff --git a/notes/possible_ingest_targets.txt b/notes/possible_ingest_targets.txt new file mode 100644 index 0000000..fcdc3e4 --- /dev/null +++ b/notes/possible_ingest_targets.txt @@ -0,0 +1,15 @@ + +- all releases from small journals, regardless of OA status, if small (eg, less than 200 papers published), and not big5 + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md new file mode 100644 index 0000000..74d3857 --- /dev/null +++ b/notes/tasks/2022-11-21_andrzejklimczuk_cleanup.md @@ -0,0 +1,132 @@ + +Had a huge number of SPN requests for the andrzejklimczuk.com domain, +presumably from the author. + +Many were duplicates (same file, multiple releases, often things like zenodo +duplication). Many were also GROBID 500s, due to truncated common crawl +captures. + +Needed to cleanup! 
+Basically, sorted through a few editgroups manually, then rejected all the
+rest and manually re-submitted them with the queries and commands below:
+
+    SELECT COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%';
+    => 589
+
+    SELECT ingest_file_result.status, COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        GROUP BY ingest_file_result.status;
+
+         status     | count
+    ----------------+-------
+     cdx-error      |     1
+     success        |   587
+     wrong-mimetype |     1
+    (3 rows)
+
+    SELECT grobid.status_code, COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        GROUP BY grobid.status_code;
+
+     status_code | count
+    -------------+-------
+             200 |   385
+             500 |   202
+                 |     2
+    (3 rows)
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+            AND ingest_file_result.status = 'success'
+            AND grobid.status_code = 500
+    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json';
+    => COPY 202
+
+    COPY (
+        SELECT row_to_json(ingest_request.*) FROM ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+            AND ingest_file_result.status = 'success'
+            AND grobid.status_code = 200
+    ) TO '/srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json';
+    => COPY 385
+
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.rows.json \
+    > /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json
+
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+    | jq '. + {force_recrawl: true}' -c \
+    > /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.json \
+    | shuf \
+    | head -n60000 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
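+
+To spot-check that the requests actually landed on the topic, a consumer
+one-liner like this should work (a sketch; -o -5 starts five messages
+before the end of each partition, -e exits at end):
+
+    kafkacat -C -b wbgrp-svc350.us.archive.org \
+        -t sandcrawler-prod.ingest-file-requests-priority \
+        -o -5 -e \
+        | jq .base_url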
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+    | shuf \
+    | head -n100 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_good_spn.json \
+    | shuf \
+    | head -n10000 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+
+sudo -u sandcrawler pipenv run \
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/andrzejklimczuk_bad_spn.rows.json \
+    > /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json
+
+cat /srv/sandcrawler/tasks/andrzejklimczuk_bad2_spn.json \
+    | shuf \
+    | head -n60000 \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
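+
+Once the force_recrawl requests work through the queue, re-running the
+GROBID breakdown from above should show whether the truncated-capture 500s
+cleared (this is the same query as earlier, unchanged):
+
+    SELECT grobid.status_code, COUNT(*) from ingest_request
+        LEFT JOIN ingest_file_result ON
+            ingest_file_result.ingest_type = ingest_request.ingest_type
+            AND ingest_file_result.base_url = ingest_request.base_url
+        LEFT JOIN grobid ON
+            grobid.sha1hex = ingest_file_result.terminal_sha1hex
+        WHERE
+            ingest_request.link_source = 'spn'
+            AND ingest_request.ingest_type = 'pdf'
+            AND ingest_request.base_url like 'https://andrzejklimczuk.com/%'
+        GROUP BY grobid.status_code;
+
+The 500 bucket should shrink toward zero as re-crawls replace the truncated
+captures.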