aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest')
-rw-r--r--notes/ingest/2021-09-02_oai_pmh_patch.md4
-rw-r--r--notes/ingest/2022-03_oaipmh.md40
-rw-r--r--notes/ingest/2022-07-19_dblp.md50
-rw-r--r--notes/ingest/2022-07_doaj.md199
-rw-r--r--notes/ingest/2022-07_targeted.md140
-rw-r--r--notes/ingest/2022-09_oaipmh.md397
6 files changed, 828 insertions, 2 deletions
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md
index fded7b3..ac808dd 100644
--- a/notes/ingest/2021-09-02_oai_pmh_patch.md
+++ b/notes/ingest/2021-09-02_oai_pmh_patch.md
@@ -1506,8 +1506,8 @@ possible to detect these at ingest time, or earlier at OAI-PMH
harvest/transform time and filter them out.
It may be worthwhile to attempt ingest of multiple existing captures
-(timestamps) in the ingest pipeline. Eg, isntead of chosing a single "best"
-capture, if therea are multiple HTTP 200 status captures, try ingest with each
+(timestamps) in the ingest pipeline. Eg, instead of choosing a single "best"
+capture, if there are multiple HTTP 200 status captures, try ingest with each
(or at least a couple). This is because repository software gets upgraded, so
old "no-capture" or "not found" or "link loop" type captures may work when
recrawled.
diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md
new file mode 100644
index 0000000..d2a8d71
--- /dev/null
+++ b/notes/ingest/2022-03_oaipmh.md
@@ -0,0 +1,40 @@
+
+Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl.
+
+Note that Martin excluded many Indonesian endpoints, will need to follow up on
+those.
+
+## Prep
+
+Fetch metadata snapshot:
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst
+
+ wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst
+
+Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large):
+
+ zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \
+ | rg -v 'oai:kb.dk:' \
+ | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \
+ | rg -v 'oai:hispana.mcu.es:' \
+ | rg -v 'oai:bnf.fr:' \
+ | rg -v 'oai:ukm.si:' \
+ | rg -v 'oai:biodiversitylibrary.org:' \
+ | rg -v 'oai:hsp.org:' \
+ | rg -v 'oai:repec:' \
+ | rg -v 'oai:n/a:' \
+ | rg -v 'oai:quod.lib.umich.edu:' \
+ | rg -v 'oai:americanae.aecid.es:' \
+ | rg -v 'oai:www.irgrid.ac.cn:' \
+ | rg -v 'oai:espace.library.uq.edu:' \
+ | rg -v 'oai:edoc.mpg.de:' \
+ | rg -v 'oai:bibliotecadigital.jcyl.es:' \
+ | rg -v 'oai:repository.erciyes.edu.tr:' \
+ | rg -v 'oai:krm.or.kr:' \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz
+
+These failed to transform in the expected way; a change in JSON schema from last time?
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@
+
+Cross-posting from fatcat bulk metadata update/ingest.
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 631k 0:00:11 [54.0k/s]
+
+
+## Post-Crawl Stats
+
+This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
+2022-09-06:
+
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'dblp'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+-----------------------+--------
+ pdf | success | 305142
+ pdf | no-pdf-link | 192683
+ pdf | no-capture | 42634
+ pdf | terminal-bad-status | 38041
+ pdf | skip-url-blocklist | 31055
+ pdf | link-loop | 9263
+ pdf | wrong-mimetype | 4545
+ pdf | redirect-loop | 3952
+ pdf | empty-blob | 2705
+ pdf | wayback-content-error | 834
+ pdf | wayback-error | 294
+ pdf | petabox-error | 202
+ pdf | blocked-cookie | 155
+ pdf | cdx-error | 115
+ pdf | body-too-large | 66
+ pdf | bad-redirect | 19
+ pdf | timeout | 7
+ pdf | bad-gzip-encoding | 4
+ (18 rows)
+
+That is quite a lot of `no-pdf-link`, might be worth doing a random sample
+and/or re-ingest. And a chunk of `no-capture` to retry.
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@
+
+This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
+heritrix bulk crawling, along with JALC and DOAJ URLs.
+
+ export SNAPSHOT=2022-07-20
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
+ # 9.72M 0:36:28 [4.44k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # 9.72M 0:17:04 [9.49k/s]
+ # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
+
+Stats after this load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3165539
+ pdf | | 2078874
+ html | | 1547698
+ html | wrong-scope | 1114332
+ pdf | no-pdf-link | 517261
+ html | success | 388376
+ html | unknown-scope | 242044
+ pdf | no-capture | 179030
+ pdf | terminal-bad-status | 174741
+ html | no-capture | 155323
+ pdf | null-body | 129267
+ pdf | redirect-loop | 127136
+ html | html-resource-no-capture | 117275
+ html | null-body | 100296
+ pdf | blocked-cookie | 71093
+ html | redirect-loop | 65519
+ html | terminal-bad-status | 64856
+ html | blocked-cookie | 64095
+ html | spn2-backoff | 55173
+ pdf | link-loop | 27440
+ html | wrong-mimetype | 26016
+ html | wayback-content-error | 20109
+ xml | | 13624
+ pdf | wrong-mimetype | 8411
+ xml | success | 6899
+ html | petabox-error | 6199
+ html | wayback-error | 5269
+ html | spn2-cdx-lookup-failure | 4635
+ html | spn2-recent-capture | 4527
+ xml | null-body | 2353
+ (30 rows)
+
+## Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
+ # COPY 3962331
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
+ # 3.96M 0:01:47 [36.7k/s]
+
+Top domains:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 789988 www.mdpi.com
+ 318142 www.frontiersin.org
+ 226316 link.springer.com
+ 204429 www.scielo.br
+ 201175 www.sciencedirect.com
+ 72852 ieeexplore.ieee.org
+ 68983 dx.doi.org
+ 33286 www.dovepress.com
+ 26020 elifesciences.org
+ 23838 www.cetjournal.it
+ 21102 mab-online.nl
+ 20242 www.revistas.usp.br
+ 16564 periodicos.uem.br
+ 15710 journals.openedition.org
+ 14514 dergipark.org.tr
+ 14072 apcz.umk.pl
+ 13924 ojs.minions.amsterdam
+ 13717 bmgn-lchr.nl
+ 13512 ojstest.minions.amsterdam
+ 10440 journals.asm.org
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # Done
+
+## Stats Again
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 4704006
+ html | wrong-scope | 1761227
+ html | success | 778165
+ pdf | no-pdf-link | 759805
+ html | no-capture | 382080
+ html | unknown-scope | 313391
+ html | html-resource-no-capture | 292953
+ pdf | no-capture | 290311
+ pdf | terminal-bad-status | 271776
+ pdf | null-body | 129267
+ pdf | blocked-cookie | 108491
+ html | terminal-bad-status | 103014
+ html | null-body | 100296
+ html | blocked-cookie | 88533
+ pdf | | 81517
+ pdf | skip-url-blocklist | 76443
+ html | spn2-backoff | 50615
+ pdf | link-loop | 45516
+ html | wrong-mimetype | 33525
+ html | wayback-content-error | 25535
+ pdf | empty-blob | 21431
+ pdf | redirect-loop | 19795
+ html | petabox-error | 18291
+ html | empty-blob | 14391
+ pdf | wrong-mimetype | 14084
+ html | redirect-loop | 12856
+ xml | success | 10381
+ xml | no-capture | 10008
+ html | skip-url-blocklist | 3294
+ html | cdx-error | 3275
+ (30 rows)
+
+Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k
+PDFs with no attempt at all? Maybe a filter, or bogus URLs.
+
+Over 1.5M new PDF success over this crawl iteration period, nice.
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@
+
+Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.
+
+ export PATCHDATE=2022-07-29
+ export CRAWLVM=wbgrp-svc279.us.archive.org
+ export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07
+
+## Seedlist Query
+
+Terminal URLs dump:
+
+ COPY (
+ SELECT row_to_json(t) FROM (
+ SELECT ingest_file_result.terminal_url, ingest_request.*
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ (
+ ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ )
+ -- AND ingest_file_result.updated >= '2022-01-12'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status LIKE 'spn2-%'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR (
+ ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ )
+ )
+ )
+ AND (
+ ingest_request.link_source = 'doi'
+ OR ingest_request.link_source = 'doaj'
+ OR ingest_request.link_source = 'dblp'
+ OR ingest_request.link_source = 'arxiv'
+ OR ingest_request.link_source = 'pmc'
+ -- OR ingest_request.link_source = 'unpaywall'
+ -- OR ingest_request.link_source = 'oai'
+ )
+
+ AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'
+
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
+ AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'
+
+ -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
+ ) t
+ ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
+ => COPY 3524573
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
+ | rg -v "\\\\" \
+ | jq -r .terminal_url \
+ | rg '://' \
+ | rg -i '^http' \
+ | rg -v '://10\.' \
+ | rg -v '://172\.' \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
+ => 3.11M 0:01:08 [45.4k/s]
+
+ # check top domains
+ cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
+ 624948 doi.org
+ 382492 www.jstage.jst.go.jp
+ 275087 www.mdpi.com
+ 157134 www.persee.fr
+ 108979 www.sciencedirect.com
+ 94375 www.scielo.br
+ 50834 onlinelibrary.wiley.com
+ 49991 journals.lww.com
+ 30354 www.frontiersin.org
+ 27963 doaj.org
+ 27058 www.e-periodica.ch
+ 24147 dl.acm.org
+ 23389 aclanthology.org
+ 22086 www.research-collection.ethz.ch
+ 21589 medien.die-bonn.de
+ 18866 www.ingentaconnect.com
+ 18583 doi.nrct.go.th
+ 18271 repositories.lib.utexas.edu
+ 17634 hdl.handle.net
+ 16366 archives.datapages.com
+ 15146 cgscholar.com
+ 13987 dl.gi.de
+ 13188 www.degruyter.com
+ 12503 ethos.bl.uk
+ 12304 preprints.jmir.org
+
+ cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
+ => done
+
+ scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
+ ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+
+## Re-Ingest
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
+ => 3.52M 0:01:37 [36.2k/s]
+
+Ingest:
+
+ cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
new file mode 100644
index 0000000..ac7c68f
--- /dev/null
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -0,0 +1,397 @@
+
+Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921>
+
+I updated the transform script to block some additional domains.
+
+
+## Prep
+
+Fetch the snapshot:
+
+ cd /srv/sandcrawler/tasks/
+ wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst
+
+Transform to ingest requests:
+
+ cd /srv/sandcrawler/src/python
+ git log | head -n1
+ # commit dfd4605d84712eccb95a63e50b0bcb343642b433
+
+ pipenv shell
+ zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \
+ | ./scripts/oai2ingestrequest.py - \
+ | pv -l \
+ | gzip \
+ > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz
+ # 16.1M 1:01:02 [4.38k/s]
+
+Curious about types, though this would probably be handled at fatcat ingest
+time:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt
+
+ head oai_type_counts.txt -n30
+ 5623867 info:eu-repo/semantics/article
+ 5334928 info:eu-repo/semantics/publishedVersion
+ 3870359 text
+ 1240225 Text
+ 829169 Article
+ 769849 NonPeerReviewed
+ 665700 PeerReviewed
+ 648740 Peer-reviewed Article
+ 547857 article
+ 482906 info:eu-repo/semantics/bachelorThesis
+ 353814 Thesis
+ 329269 Student thesis
+ 262650 info:eu-repo/semantics/conferenceObject
+ 185354 Journal articles
+ 162021 info:eu-repo/semantics/doctoralThesis
+ 152079 Journal Article
+ 150226 Research Article
+ 130217 Conference papers
+ 127255 Artículo revisado por pares
+ 124243 Newspaper
+ 123908 ##rt.metadata.pkp.peerReviewed##
+ 123309 Photograph
+ 122981 info:eu-repo/semantics/masterThesis
+ 116719 Book
+ 108946 Image
+ 108216 Report
+ 107946 Other
+ 103562 masterThesis
+ 103038 info:eu-repo/semantics/other
+ 101404 StillImage
+ [...]
+
+And formats:
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt
+
+ head -n 20 oai_format_counts.txt
+ 11151928 application/pdf
+ 677413 text
+ 561656 text/html
+ 498518 image/jpeg
+ 231219 Text
+ 193638 text/xml
+ 147214 Image
+ 117073 image/jpg
+ 110872 pdf
+ 91323 image/tiff
+ 76948 bib
+ 75393 application/xml
+ 70244 Digitized from 35 mm. microfilm.
+ 68206 mods
+ 59227 PDF
+ 57677 application/epub+zip
+ 57602 application/octet-stream
+ 52072 text/plain
+ 51620 application/msword
+ 47227 audio/mpeg
+
+Also, just overall size (number of records):
+
+ zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l
+ # 20,840,301
+
+Next load in to sandcrawler DB:
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request -
+
+ Traceback (most recent call last):
+ File "./persist_tool.py", line 311, in <module>
+ main()
+ File "./persist_tool.py", line 307, in main
+ args.func(args)
+ File "./persist_tool.py", line 119, in run_ingest_request
+ pusher.run()
+ File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run
+ self.worker.push_batch(batch)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch
+ resp = self.db.insert_ingest_request(self.cur, irequests)
+ File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request
+ resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
+ File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values
+ cur.execute(b''.join(parts))
+ psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx"
+ DETAIL: Index row references tuple (6893121,3) in relation "ingest_request".
+ HINT: Values larger than 1/3 of a buffer page cannot be indexed.
+ Consider a function index of an MD5 hash of the value, or use full text indexing.
+ 15.7M 0:41:48 [6.27k/s]
+
+Darn, this means we won't get reasonable stats about how many rows were
+inserted/updated.
+
+Patched the persist tool to skip very long URLs, and ran again (backwards, just
+URLs which didn't get inserted already):
+
+ zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \
+ | tac \
+ | head -n1000000 \
+ | pv -l \
+ | ./persist_tool.py ingest-request -
+ # 1.00M 0:03:04 [5.41k/s]
+ # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0})
+
+Status of just the new lines:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+---------
+ | 6398455
+ success | 540219
+ no-pdf-link | 41316
+ link-loop | 23871
+ no-capture | 11350
+ redirect-loop | 8315
+ wrong-mimetype | 2394
+ terminal-bad-status | 1540
+ null-body | 1038
+ cdx-error | 272
+ empty-blob | 237
+ petabox-error | 213
+ wayback-error | 186
+ blocked-cookie | 107
+ timeout | 47
+ wayback-content-error | 26
+ spn2-cdx-lookup-failure | 21
+ skip-url-blocklist | 16
+ spn2-backoff | 15
+ body-too-large | 13
+ (20 rows)
+
+
+## Bulk Ingest
+
+Should already have filtered domains/prefixes in transform script, so not
+including filters here.
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json';
+ # COPY 6398455
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json
+ # 6.40M 0:02:18 [46.2k/s]
+
+ cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # DONE
+
+Expect this ingest to take a week or so.
+
+Then, run stats again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ no-capture | 3617175
+ success | 2775036
+ no-pdf-link | 449298
+ link-loop | 74260
+ terminal-bad-status | 47819
+ wrong-mimetype | 20195
+ redirect-loop | 18197
+ empty-blob | 12127
+ cdx-error | 3038
+ skip-url-blocklist | 2630
+ wayback-error | 2599
+ petabox-error | 2354
+ wayback-content-error | 1617
+ blocked-cookie | 1293
+ null-body | 1038
+ body-too-large | 670
+ | 143
+ bad-gzip-encoding | 64
+ timeout | 47
+ spn2-cdx-lookup-failure | 20
+ (20 rows)
+
+
+## Crawl Seedlist
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ AND (
+ ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'redirect-loop'
+ OR ingest_file_result.status = 'terminal-bad-status'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'timeout'
+ OR ingest_file_result.status = 'wayback-content-error'
+ )
+ ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json';
+ => COPY 3692846
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | pv -l \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json
+ => 3.69M 0:01:19 [46.6k/s]
+
+This will be used for re-ingest later. For now, extract URLs:
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | jq .base_url -r \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ => 3.66M 0:00:59 [61.8k/s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \
+ | rg '"terminal_url"' \
+ | jq -r .result.terminal_url \
+ | rg -v ^null$ \
+ | sort -u -S 4G \
+ | pv -l \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ => 0.00 0:00:05 [0.00 /s]
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | awk '{print "F+ " $1}' \
+ | shuf \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+What domains are we crawling?
+
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \
+ | sort -u -S 4G \
+ | cut -d/ -f3 \
+ | sort \
+ | uniq -c \
+ | sort -nr \
+ > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+
+ head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt
+ 91899 raco.cat
+ 70116 islandora.wrlc.org
+ 68708 urn.kb.se
+ 63726 citeseerx.ist.psu.edu
+ 50370 publications.rwth-aachen.de
+ 44885 urn.nsk.hr
+ 38429 server15795.contentdm.oclc.org
+ 33041 periodicos.ufpb.br
+ 32519 nbn-resolving.org
+ 31990 www.ajol.info
+ 24745 hal.archives-ouvertes.fr
+ 22569 id.nii.ac.jp
+ 17239 tilburguniversity.on.worldcat.org
+ 15873 dspace.nbuv.gov.ua
+ 15436 digitalcommons.wustl.edu
+ 14885 www.iiste.org
+ 14623 www.manchester.ac.uk
+ 14033 nbn-resolving.de
+ 13999 opus4.kobv.de
+ 13689 www.redalyc.org
+
+Sizes:
+
+ wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt
+ 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt
+ 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule
+
+
+Copy seedlist to crawler:
+
+ # as regular user
+ scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
+
+## Post-Crawl Bulk Ingest
+
+ # ran 2022-11-16, after crawl cleanup
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -----------------------+---------
+ success | 4721164 +1,946,128
+ no-pdf-link | 1116290
+ no-capture | 673939
+ terminal-bad-status | 232217
+ link-loop | 148544
+ wrong-mimetype | 68841
+ redirect-loop | 26262
+ empty-blob | 17759
+ cdx-error | 6570
+ blocked-cookie | 4026
+ blocked-wall | 3054
+ skip-url-blocklist | 2924
+ body-too-large | 2404
+ bad-redirect | 1565
+ wayback-error | 1320
+ petabox-error | 1083
+ null-body | 1038
+ wayback-content-error | 264
+ bad-gzip-encoding | 150
+ | 143
+ (20 rows)
+