aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-12-23 15:24:33 -0800
committerBryan Newbold <bnewbold@archive.org>2022-12-23 15:24:33 -0800
commitc42a7f46d29ca9d6e7c84b1b0617f272f71f25a5 (patch)
treeefe52857722e33473626b74e41a5fcdbcdd30a6c
parent3a7a946a1f271cd0f334129fa3bb51c451b82966 (diff)
downloadsandcrawler-c42a7f46d29ca9d6e7c84b1b0617f272f71f25a5.tar.gz
sandcrawler-c42a7f46d29ca9d6e7c84b1b0617f272f71f25a5.zip
notes: old examples
-rw-r--r--notes/examples/2021-11-12_broken_grobid_xml.md83
-rw-r--r--notes/examples/dataset_examples.txt52
-rw-r--r--notes/examples/html_test_journals.txt153
-rw-r--r--notes/examples/random_datasets.md19
4 files changed, 307 insertions, 0 deletions
diff --git a/notes/examples/2021-11-12_broken_grobid_xml.md b/notes/examples/2021-11-12_broken_grobid_xml.md
new file mode 100644
index 0000000..5223651
--- /dev/null
+++ b/notes/examples/2021-11-12_broken_grobid_xml.md
@@ -0,0 +1,83 @@
+
+Find all the PDFs from web which resulted in `bad-grobid-xml` status code (among others):
+
+ sql> select * from grobid where status != 'success' and status_code != 500 and status_code != 503 and status != 'error-timeout' limit 100;
+
+ sha1hex | updated | grobid_version | status_code | status | fatcat_release | metadata
+ ------------------------------------------+-------------------------------+----------------+-------------+----------------+----------------+------------------------------------------------------------------------
+ d994efeea3b653e2dbe8e13e5a6d203e9b9484ab | 2020-03-20 04:04:40.093094+00 | | 200 | error | | {"error_msg": "response XML too large: 12052192 bytes"}
+ 8dadf846488ddc2ff3934dd6beee0e3046fa3800 | 2020-11-24 01:24:02.668692+00 | | 200 | error | | {"error_msg": "response XML too large: 18758248 bytes"}
+ 227900724e5cf9fbd06146c914239d0c12c3671a | 2020-03-18 10:24:33.394339+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20200210041053/https://pdfs.semanticscholar.org/2279/00724e5cf9fbd06146c914239d0c12c3671a.pdf
+ FIXED
+ f667b4ef2befb227078169ed57ffc6efc5fa85c2 | 2020-03-20 04:54:18.902756+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 527"}
+ https://web.archive.org/web/20200218182411/https://pdfs.semanticscholar.org/f667/b4ef2befb227078169ed57ffc6efc5fa85c2.pdf
+ FIXED
+ c1e8d9df347b8de53fc2116615b1343ba327040d | 2020-11-08 21:46:04.552442+00 | | 200 | bad-grobid-xml | | {"error_msg": "mismatched tag: line 198, column 3"}
+ https://web.archive.org/web/20200904163312/https://arxiv.org/pdf/1906.02107v1.pdf
+ FIXED (and good)
+ 4d9860a5eeee6bc671c3be859ca78f89669427f0 | 2021-11-04 01:29:13.081596+00 | | 200 | bad-grobid-xml | | {"error_msg": "unclosed token: line 812, column 7"}
+ https://web.archive.org/web/20211104012833/https://actabalneologica.eu/wp-content/uploads/library/ActaBalneol2021i3.pdf
+ FIXED
+ metadata quality mixed, but complex document (?)
+ 7cfc0739be9c49d94272110a0a748256bdde9be6 | 2021-07-25 17:06:03.919073+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 38, column 440"}
+ https://web.archive.org/web/20210716124436/https://jsesd.csers-ly.com/index.php/jsesd/article/download/28/23
+ FIXED
+ 088c61a229084d13f85524efcc9f38a80dd19caf | 2021-09-01 08:08:18.531533+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 47, column 814"}
+ https://web.archive.org/web/20210814181328/https://wmrj.areeo.ac.ir/article_120843_3806466cb1f5a125c328f99866751a43.pdf
+ FIXED
+ 19e70297e523e9f32cd4379af33a12ab95c34a71 | 2021-11-05 10:09:25.407657+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 853, column 84"}
+ not found
+ acc855d74431537b98de5185e065e4eacbab7b26 | 2021-11-12 22:57:22.439007+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 60, column 45"}
+ https://web.archive.org/web/20211111182756/https://arxiv.org/pdf/2006.13365v5.pdf
+ BROKEN: not well-formed (invalid token): line 60, column 45
+ <note type="raw_affiliation"><label>&</label> Fraunhofer IAIS, Sankt Augustin and Dresden, Germany.</note>
+ 8e73055c63d1e684b59059ac418f55690a2eec01 | 2021-11-12 17:34:46.343685+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 44, column 45"}
+ not found
+ c2b3f696e97b9e80f38c35aa282416e95d6d9f5e | 2021-11-12 22:57:12.417191+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 58, column 45"}
+ https://web.archive.org/web/20211112051714/https://ccsenet.org/journal/index.php/gjhs/article/download/0/0/46244/49308
+ BROKEN: not well-formed (invalid token): line 58, column 45
+ <note type="raw_affiliation"><label>&</label> Ren, 2020; Meng, Hua, &amp; Bian, 2020).</note>
+ 840d4609308c4a7748393181fe1f6a45f9d425c5 | 2021-11-12 22:57:17.433022+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 1824, column 45"}
+ not found
+ 3deb6375e894c5007207502bf52d751a47a20725 | 2021-11-12 23:11:17.711948+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 65, column 45"}
+ not found
+ f1d06080a4b1ac72ab75226e692e8737667c29a7 | 2020-01-16 09:23:27.579995+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 29, column 1581"}
+ https://web.archive.org/web/20180721030918/https://journals.squ.edu.om/index.php/jams/article/download/650/649
+ FIXED, good
+ f3e7b91fce9132addc59bd1560c5eb16c0330842 | 2020-01-12 11:58:06.654613+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180426020051/http://jhsw.tums.ac.ir/article-1-5121-en.pdf
+ FIXED
+ 37edcaa6f67fbb8c3e27fa02da4f0fa780e33bca | 2020-01-04 21:53:49.578847+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 28, column 1284"}
+ https://web.archive.org/web/20180510115632/http://www.fmreview.org/sites/fmr/files/FMRdownloads/ar/detention/majidi.pdf
+ FIXED
+ 3f1d302143824808f7109032687a327708896748 | 2020-01-05 20:51:18.783034+00 | | 200 | bad-grobid-xml | | {"error_msg": "not well-formed (invalid token): line 40, column 1122"}
+ https://web.archive.org/web/20180428082655/http://jhsw.tums.ac.ir/browse.php?a_id=5121&sid=1&slc_lang=fa&ftxt=1
+ FIXED
+ (21 rows)
+
+Some other errors from other queries:
+
+ d9634f194bc3dee27db7a1cb49b30e48803d7ad8 | 2020-01-06 16:01:09.331272+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/VyuJWqREHT.lxml"}
+ https://web.archive.org/web/20190304092121/http://pdfs.semanticscholar.org/d963/4f194bc3dee27db7a1cb49b30e48803d7ad8.pdf
+ FIXED: with 0.7.0+
+
+ 56c9b5398ef94df54d699342740956caf4523925 | 2020-02-06 21:37:42.139761+00 | | 500 | error | | {"error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1"}
+ https://web.archive.org/web/20080907000756/http://www.rpi.edu/~limc/poster_ding.pdf
+ still errors: "error_msg": "[BAD_INPUT_DATA] PDF to XML conversion failed with error code: 1", "status": "error", "status_code": 500
+ BAD PDF ("no pages" in evince)
+
+ d7cf65ed211cf1e3420c595fdbecc5d18f297b11 | 2020-01-10 23:19:16.783415+00 | | 500 | error | | {"error_msg": "[PARSING_ERROR] Cannot parse file: /run/grobid/tmp/dBV73X4HrZ.lxml"}
+ https://web.archive.org/web/20170812074846/http://dspace.utpl.edu.ec/bitstream/123456789/7918/1/Tesis_de_Jacome_Valdivieso_Soraya_Stephan%c3%ada.pdf
+ FIXED
+
+ 51d070ab398a8744286ef7356445f0828a9f3abb | 2020-02-06 16:01:23.98892+00 | | 503 | error | | {"error_msg": "<html>\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\"/>\n<t
+ https://web.archive.org/web/20191113160818/http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC2082155&blobtype=pdf
+ FIXED
+
+In summary, there are still a small number of `bad-grobid-xml` cases, and still
+many "very large PDF" cases. But we should probably broadly retry everything,
+especially the 503 errors (from when GROBID is simply down/unavailable).
+
+The `bad-grobid-xml` cases here were all from "<label>" in raw affiliations,
+which I have submitted a patch/PR for.
diff --git a/notes/examples/dataset_examples.txt b/notes/examples/dataset_examples.txt
new file mode 100644
index 0000000..3a04750
--- /dev/null
+++ b/notes/examples/dataset_examples.txt
@@ -0,0 +1,52 @@
+
+### ArchiveOrg: CAT dataset
+
+<https://archive.org/details/CAT_DATASET>
+
+`release_36vy7s5gtba67fmyxlmijpsaui`
+
+###
+
+<https://archive.org/details/academictorrents_70e0794e2292fc051a13f05ea6f5b6c16f3d3635>
+
+doi:10.1371/journal.pone.0120448
+
+Single .rar file
+
+### Dataverse
+
+<https://dataverse.rsu.lv/dataset.xhtml?persistentId=doi:10.48510/FK2/IJO02B>
+
+Single excel file
+
+### Dataverse
+
+<https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+doi:10.7910/DVN/CLSFKX
+
+Mulitple files; multiple versions?
+
+API fetch: <https://dataverse.harvard.edu/api/datasets/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX&version=1.1>
+
+ .data.id
+ .data.latestVersion.datasetPersistentId
+ .data.latestVersion.versionNumber, .versionMinorNumber
+ .data.latestVersion.files[]
+ .dataFile
+ .contentType (mimetype)
+ .filename
+ .filesize (int, bytes)
+ .md5
+ .persistendId
+ .description
+ .label (filename?)
+ .version
+
+Single file inside: <https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB>
+
+Download single file: <https://dataverse.harvard.edu/api/access/datafile/:persistentId/?persistentId=doi:10.7910/DVN/CLSFKX/XWEHBB> (redirects to AWS S3)
+
+Dataverse refs:
+- 'doi' and 'hdl' are the two persistentId styles
+- file-level persistentIds are optional, on a per-instance basis: https://guides.dataverse.org/en/latest/installation/config.html#filepidsenabled
diff --git a/notes/examples/html_test_journals.txt b/notes/examples/html_test_journals.txt
new file mode 100644
index 0000000..540dc9f
--- /dev/null
+++ b/notes/examples/html_test_journals.txt
@@ -0,0 +1,153 @@
+
+Good examples of journals to run HTML fulltext extraction on.
+
+## Live Web
+
+d-lib magazine
+ live web
+ no longer active
+ http://www.dlib.org/back.html
+
+NLM technical bulletin
+ https://www.nlm.nih.gov/pubs/techbull/back_issues.html
+
+Genders
+ https://web.archive.org/web/20141227010240/http://www.genders.org:80/index.html
+
+firstmondays
+ live web; now OJS
+
+outhistory.org
+
+http://journal.sjdm.org/
+
+http://whoosh.org/
+
+
+## Vanished (but wayback coverage)
+
+ohmylittledata
+ issn:2551-1289
+ vanished
+ blog format
+ http://web.archive.org/web/20180421061156/https://ohmylittledata.com/
+
+exquisit corpse
+ https://web.archive.org/web/20080521052400/http://corpse.org:80/
+
+Journal of Mundane Behavior
+ https://fatcat.wiki/container/tjwfvrjlunf25ofegccgjjmvya
+ ISSN: 1529-3041
+
+ defunct since ~2010
+ simple HTML articles
+ references
+ http://web.archive.org/web/20100406162007/http:/mundanebehavior.org/index2.htm
+ http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm
+
+War Crimes
+
+ PDF articles (not HTML)
+ http://web.archive.org/web/20120916035741/http:/www.war-crimes.org/
+
+
+## DOAJ Test Articles (HTML)
+
+ zcat doaj_article_data_2020-08-07.json.gz | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.txt
+ => 2,184,954
+
+ cut -f3 -d/ html_fulltext_urls.txt | sort | uniq -c | sort -nr | head -n25
+ 254817 link.springer.com
+ 145159 www.scielo.br
+ 78044 journal.frontiersin.org
+ 77394 www.frontiersin.org
+ 40849 www.dovepress.com
+ 19024 dergipark.org.tr
+ 18758 periodicos.ufsc.br
+ 16346 www.revistas.usp.br
+ 15872 revistas.unal.edu.co
+ 15527 revistas.ucm.es
+ 13669 revistas.usal.es
+ 12640 dergipark.gov.tr
+ 12111 journals.rudn.ru
+ 11839 www.scielosp.org
+ 11277 www.karger.com
+ 10827 www.journals.vu.lt
+ 10318
+ 9854 peerj.com
+ 9100 ojs.unud.ac.id
+ 8581 jurnal.ugm.ac.id
+ 8261 riviste.unimi.it
+ 8012 journals.uran.ua
+ 7454 revistas.pucp.edu.pe
+ 7264 journals.vgtu.lt
+ 7200 publicaciones.banrepcultural.org
+
+ cat html_fulltext_urls.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.filtered.txt
+ => 1,579,257
+
+ zcat doaj_article_data_2020-08-07.json.gz | rg -v '"doi"' | jq '.bibjson.link[]' -c | rg -i '"html"' | rg -v doi.org | rg '"fulltext"' | jq -r .url | pv -l > html_fulltext_urls.no_doi.txt
+ => 560k
+
+ cut -f3 -d/ html_fulltext_urls.no_doi.txt | sort | uniq -c | sort -nr | head -n25
+ 40849 www.dovepress.com
+ 10570 journals.rudn.ru
+ 10494 dergipark.org.tr
+ 10233 revistas.unal.edu.co
+ 9981 dergipark.gov.tr
+ 9428 revistas.usal.es
+ 8292 revistas.ucm.es
+ 7200 publicaciones.banrepcultural.org
+ 6953 revistas.pucp.edu.pe
+ 6000 www.scielosp.org
+ 5962 www.scielo.br
+ 5621 www.richtmann.org
+ 5123 scielo.sld.cu
+ 5067 ojs.unud.ac.id
+ 4838 periodicos.ufsc.br
+ 4736 revistasonlinepre.inap.es
+ 4486 journal.fi
+ 4221 www.seer.ufu.br
+ 3553 revistas.uam.es
+ 3492 revistas.pucsp.br
+ 3060 www.scielo.org.co
+ 2991 scielo.isciii.es
+ 2802 seer.ufrgs.br
+ 2692 revistas.unc.edu.ar
+ 2685 srl.si
+
+ cat html_fulltext_urls.no_doi.txt \
+ | rg -v link.springer.com \
+ | rg -v scielo \
+ | rg -v dergipark.gov.tr \
+ | rg -v frontiersin.org \
+ > html_fulltext_urls.no_doi.filtered.txt
+ => 518,608
+
+ zcat doaj_articles_2020-08-07.html_fulltext_urls.no_doi.filtered.txt.gz | shuf -n20
+ https://revistas.unc.edu.ar/index.php/revistaEF/article/view/22795
+ https://journal.umy.ac.id/index.php/st/article/view/3297
+ https://www.unav.edu/publicaciones/revistas/index.php/estudios-sobre-educacion/article/view/23442
+ http://publications.muet.edu.pk/research_papers/pdf/pdf1615.pdf
+ http://revistas.uncu.edu.ar/ojs/index.php/revistaestudiosclasicos/article/view/1440
+ https://journal.fi/inf/article/view/59430
+ http://journal.uii.ac.id/index.php/Eksakta/article/view/2429
+ https://www.dovepress.com/infant-sleep-and-its-relation-with-cognition-and-growth-a-narrative-re-peer-reviewed-article-NSS
+ https://revistasonlinepre.inap.es/index.php/REALA/article/view/9157
+ http://dergipark.org.tr/dubited/issue/27453/299047?publisher=duzce
+ http://revistas.pucp.edu.pe/index.php/themis/article/view/11862
+ http://journal.bdfish.org/index.php/fisheries/article/view/91
+ https://ojs.unud.ac.id/index.php/buletinfisika/article/view/30567
+ https://www.lithosphere.ru/jour/article/view/779
+ https://journals.hioa.no/index.php/seminar/article/view/2412
+ http://revistas.unicauca.edu.co/index.php/rfcs/article/view/197
+ https://www.kmuj.kmu.edu.pk/article/view/15698
+ http://forodeeducacion.com/ojs/index.php/fde/article/view/82
+ https://revistas.unc.edu.ar/index.php/ConCienciaSocial/article/view/19941
+ http://grbs.library.duke.edu/article/view/3361
+
diff --git a/notes/examples/random_datasets.md b/notes/examples/random_datasets.md
new file mode 100644
index 0000000..b69132c
--- /dev/null
+++ b/notes/examples/random_datasets.md
@@ -0,0 +1,19 @@
+
+Possible external datasets to ingest (which are not entire platforms):
+
+- https://research.google/tools/datasets/
+- https://openslr.org/index.html
+- https://www.kaggle.com/datasets?sort=votes&tasks=true
+- https://archive.ics.uci.edu/ml/datasets.php
+
+Existing archive.org datasets to ingest:
+
+- https://archive.org/details/allthemusicllc-datasets
+
+Papers on archive.org to ingest:
+
+- <https://archive.org/details/journals?and%5B%5D=%21collection%3Aarxiv+%21collection%3Ajstor_ejc+%21collection%3Apubmed&sin=>
+- <https://archive.org/details/biorxiv>
+- <https://archive.org/details/philosophicaltransactions?tab=collection>
+- <https://archive.org/search.php?query=doi%3A%2A>
+- <https://archive.org/details/folkscanomy_academic>