aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2022-07_doaj.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2022-07_doaj.md')
-rw-r--r--notes/ingest/2022-07_doaj.md199
1 files changed, 199 insertions, 0 deletions
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@
+
+This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for
+heritrix bulk crawling, along with JALC and DOAJ URLs.
+
+ export SNAPSHOT=2022-07-20
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
+ # 9.72M 0:36:28 [4.44k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # 9.72M 0:17:04 [9.49k/s]
+ # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})
+
+Stats after this load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3165539
+ pdf | | 2078874
+ html | | 1547698
+ html | wrong-scope | 1114332
+ pdf | no-pdf-link | 517261
+ html | success | 388376
+ html | unknown-scope | 242044
+ pdf | no-capture | 179030
+ pdf | terminal-bad-status | 174741
+ html | no-capture | 155323
+ pdf | null-body | 129267
+ pdf | redirect-loop | 127136
+ html | html-resource-no-capture | 117275
+ html | null-body | 100296
+ pdf | blocked-cookie | 71093
+ html | redirect-loop | 65519
+ html | terminal-bad-status | 64856
+ html | blocked-cookie | 64095
+ html | spn2-backoff | 55173
+ pdf | link-loop | 27440
+ html | wrong-mimetype | 26016
+ html | wayback-content-error | 20109
+ xml | | 13624
+ pdf | wrong-mimetype | 8411
+ xml | success | 6899
+ html | petabox-error | 6199
+ html | wayback-error | 5269
+ html | spn2-cdx-lookup-failure | 4635
+ html | spn2-recent-capture | 4527
+ xml | null-body | 2353
+ (30 rows)
+
+## Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
+ # COPY 3962331
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
+ # 3.96M 0:01:47 [36.7k/s]
+
+Top domains:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
+ 789988 www.mdpi.com
+ 318142 www.frontiersin.org
+ 226316 link.springer.com
+ 204429 www.scielo.br
+ 201175 www.sciencedirect.com
+ 72852 ieeexplore.ieee.org
+ 68983 dx.doi.org
+ 33286 www.dovepress.com
+ 26020 elifesciences.org
+ 23838 www.cetjournal.it
+ 21102 mab-online.nl
+ 20242 www.revistas.usp.br
+ 16564 periodicos.uem.br
+ 15710 journals.openedition.org
+ 14514 dergipark.org.tr
+ 14072 apcz.umk.pl
+ 13924 ojs.minions.amsterdam
+ 13717 bmgn-lchr.nl
+ 13512 ojstest.minions.amsterdam
+ 10440 journals.asm.org
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # Done
+
+## Stats Again
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- ORDER BY ingest_request.ingest_type, COUNT DESC
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 4704006
+ html | wrong-scope | 1761227
+ html | success | 778165
+ pdf | no-pdf-link | 759805
+ html | no-capture | 382080
+ html | unknown-scope | 313391
+ html | html-resource-no-capture | 292953
+ pdf | no-capture | 290311
+ pdf | terminal-bad-status | 271776
+ pdf | null-body | 129267
+ pdf | blocked-cookie | 108491
+ html | terminal-bad-status | 103014
+ html | null-body | 100296
+ html | blocked-cookie | 88533
+ pdf | | 81517
+ pdf | skip-url-blocklist | 76443
+ html | spn2-backoff | 50615
+ pdf | link-loop | 45516
+ html | wrong-mimetype | 33525
+ html | wayback-content-error | 25535
+ pdf | empty-blob | 21431
+ pdf | redirect-loop | 19795
+ html | petabox-error | 18291
+ html | empty-blob | 14391
+ pdf | wrong-mimetype | 14084
+ html | redirect-loop | 12856
+ xml | success | 10381
+ xml | no-capture | 10008
+ html | skip-url-blocklist | 3294
+ html | cdx-error | 3275
+ (30 rows)
+
+Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k
+PDFs with no attempt at all? Maybe a filter, or bogus URLs.
+
+Over 1.5M new PDF success over this crawl iteration period, nice.