aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2021-08_mag.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2021-08_mag.md')
-rw-r--r--notes/ingest/2021-08_mag.md400
1 files changed, 400 insertions, 0 deletions
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md
new file mode 100644
index 0000000..5f92196
--- /dev/null
+++ b/notes/ingest/2021-08_mag.md
@@ -0,0 +1,400 @@
+
+Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest.
+Also want to re-ingest some old/failed ingests, now that pipeline/code has
+improved.
+
+Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs.
+
+
+## Persist Ingest Requests
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000})
+
+ zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 22.5M 0:46:00 [8.16k/s]
+ => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585})
+
+Roughly 8.6 million new URLs
+
+## Pre-Crawl Status Counts
+
+Status of combined old and new requests, with some large domains removed:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------+----------
+ success | 26123975
+ | 6664846
+ no-pdf-link | 1859908
+ redirect-loop | 1532405
+ no-capture | 1199126
+ link-loop | 1157010
+ terminal-bad-status | 832362
+ gateway-timeout | 202158
+ spn2-cdx-lookup-failure | 81406
+ wrong-mimetype | 69087
+ invalid-host-resolution | 37262
+ wayback-error | 21340
+ petabox-error | 11237
+ null-body | 9414
+ wayback-content-error | 2199
+ cdx-error | 1893
+ spn2-error | 1741
+ spn2-error:job-failed | 971
+ blocked-cookie | 902
+ spn2-error:invalid-url-syntax | 336
+ (20 rows)
+
+And just the new URLs (note that domain filter shouldn't be required, but
+keeping for consistency):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ | 6664780
+ success | 1957844
+ redirect-loop | 23357
+ terminal-bad-status | 9385
+ no-pdf-link | 8315
+ no-capture | 6892
+ link-loop | 4517
+ wrong-mimetype | 3864
+ cdx-error | 1749
+ blocked-cookie | 842
+ null-body | 747
+ wayback-error | 688
+ wayback-content-error | 570
+ gateway-timeout | 367
+ petabox-error | 340
+ spn2-cdx-lookup-failure | 150
+ read-timeout | 122
+ not-found | 119
+ invalid-host-resolution | 63
+ spn2-error | 23
+ (20 rows)
+
+## Dump Initial Bulk Ingest Requests
+
+Note that this is all-time, not just recent, and will re-process a lot of
+"no-pdf-link":
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-pdf-link'
+ OR ingest_file_result.status = 'cdx-error'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json';
+ => COPY 8526647
+
+Transform to ingest requests:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json
+ => 8.53M 0:03:40
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+Updated stats after running initial bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 5184994
+ no-capture | 3284416
+ redirect-loop | 98685
+ terminal-bad-status | 28733
+ link-loop | 28518
+ blocked-cookie | 22338
+ no-pdf-link | 19073
+ wrong-mimetype | 9122
+ null-body | 2793
+ wayback-error | 2128
+ wayback-content-error | 1233
+ cdx-error | 1198
+ petabox-error | 617
+ gateway-timeout | 395
+ not-found | 130
+ read-timeout | 128
+ | 111
+ invalid-host-resolution | 63
+ spn2-cdx-lookup-failure | 24
+ spn2-error | 20
+ (20 rows)
+
+## Generate Seedlist
+
+For crawling, do a similar (but not identical) dump:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json';
+ => COPY 4599519
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json
+ => 4.60M 0:02:55 [26.2k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+ cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ => DONE
+
+ wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt
+ 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt
+ 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt
+ 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt
+
+## Post-Crawl Bulk Re-Ingest
+
+Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by
+hash, URL agnostic).
+
+Enqueue for buik re-ingest:
+
+ cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => Thu 19 Aug 2021 09:10:59 PM UTC
+
+
+## Post-Ingest Stats
+
+Just the new stuff (compare against above for delta):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 7748241 89.2%
+ no-capture | 429688 4.9%
+ redirect-loop | 172831 2.0%
+ terminal-bad-status | 94029 1.1%
+ no-pdf-link | 86437 1.0%
+ blocked-cookie | 67903 0.8%
+ link-loop | 50622
+ wrong-mimetype | 21064
+ null-body | 6650
+ cdx-error | 3313
+ wayback-error | 2630
+ gateway-timeout | 399
+ petabox-error | 268
+ wayback-content-error | 170
+ not-found | 130
+ read-timeout | 128
+ | 109
+ invalid-host-resolution | 63
+ bad-redirect | 39
+ spn2-error | 20
+ (20 rows)
+
+New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397
+
+Overall success of new batch: 7748241. / 8686315 = 89.2%
+
+And combined (old and new) status again:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%researchgate.net%'
+ AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%'
+ AND ingest_request.base_url NOT LIKE '%omicsonline.org%'
+ AND ingest_request.base_url NOT LIKE '%link.springer.com%'
+ AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%'
+ -- AND ingest_request.created > '2021-06-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 31990062
+ redirect-loop | 1704717
+ no-capture | 1263462
+ link-loop | 1218280
+ blocked-cookie | 1213838
+ no-pdf-link | 1096664
+ terminal-bad-status | 960070
+ gateway-timeout | 202190
+ wrong-mimetype | 86557
+ invalid-host-resolution | 37262
+ null-body | 15443
+ wayback-error | 12839
+ cdx-error | 4047
+ spn2-error | 1731
+ spn2-error:job-failed | 962
+ petabox-error | 463
+ wayback-content-error | 379
+ spn2-error:invalid-url-syntax | 336
+ spn2-error:soft-time-limit-exceeded | 203
+ | 175
+ (20 rows)
+
+New success total: 31990062 - 26123975 = 5,866,087
+
+A full 1,263,462 no-capture that could be attempted... though many of those may
+be excluded for a specific reason.