aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2020-07_mag.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2020-07_mag.md')
-rw-r--r--notes/ingest/2020-07_mag.md353
1 files changed, 353 insertions, 0 deletions
diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md
new file mode 100644
index 0000000..1d33162
--- /dev/null
+++ b/notes/ingest/2020-07_mag.md
@@ -0,0 +1,353 @@
+
+Using 2020-06-25 upstream MAG corpus snapshot.
+
+Ran munging from `scratch:ingest/mag` notes first.
+
+Expecting a couple million new ingest request URLs; this is the first "patch"
+MAG ingest on top of existing already-run requests.
+
+Planning to skip the initial bulk ingest step, on the assumption that new URLs
+have either been ingested already (eg, via continuous ingest pipeline) or need
+crawling.
+
+## Generate Requests
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json
+ => 28.7M 2:36:48 [3.06k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json
+ => 5.66M 0:29:28 [ 3.2k/s]
+
+## Persist Ingest Requests
+
+ # small sample
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0})
+
+ head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0})
+
+ cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0})
+
+## Crawl/Dupe Status
+
+Overall status for old and new seeds, filtering out large (blocking)
+publishers:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 19477651
+ | 8238898
+ redirect-loop | 2036494
+ link-loop | 1330036
+ no-pdf-link | 1304820
+ terminal-bad-status | 648150
+ no-capture | 545785
+ gateway-timeout | 200143
+ cdx-error | 149995
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 57052
+ wayback-error | 41032
+ invalid-host-resolution | 37203
+ petabox-error | 11167
+ null-body | 6662
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 77
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ | 8238851
+ success | 787174
+ no-capture | 42864
+ redirect-loop | 31718
+ terminal-bad-status | 31493
+ no-pdf-link | 13025
+ cdx-error | 11275
+ wrong-mimetype | 6238
+ link-loop | 3365
+ wayback-error | 748
+ gateway-timeout | 506
+ null-body | 191
+ spn2-cdx-lookup-failure | 99
+ petabox-error | 89
+ invalid-host-resolution | 70
+ spn2-error | 7
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ bad-gzip-encoding | 1
+ (19 rows)
+
+Where are no-capture results terminating? May need to add or update heritrix
+crawl config so that we get better yield without needing to do SPNv2 crawling.
+
+ SELECT initial_domain, terminal_domain, COUNT(*)
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ ) t1
+ GROUP BY initial_domain, terminal_domain
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ initial_domain | terminal_domain | count
+ ---------------------------------+---------------------+--------
+ www.researchgate.net | | 334145
+ academic.oup.com | | 205820
+ www.tandfonline.com | | 148638
+ journals.sagepub.com | | 144196
+ muse.jhu.edu | | 55957
+ hrcak.srce.hr | | 25317
+ www.omicsonline.org | | 22426
+ link.springer.com | | 21044
+ iopscience.iop.org | | 12385
+ bioone.org | | 9097
+ tandfonline.com | | 8512
+ or.nsfc.gov.cn | | 4823
+ ieeexplore.ieee.org | ieeexplore.ieee.org | 4398
+ pubs.acs.org | | 3708
+ archive-ouverte.unige.ch | | 2743
+ dergipark.ulakbim.gov.tr | | 2677
+ hal.archives-ouvertes.fr | | 1258
+ dergipark.org.tr | | 1207
+ apo.org.au | | 1186
+ spire.sciencespo.fr | | 989
+ cyberleninka.ru | | 895
+ lirias.kuleuven.be | | 855
+ tel.archives-ouvertes.fr | | 786
+ pub.uni-bielefeld.de | | 728
+ www.research-collection.ethz.ch | | 670
+ (25 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests (filtered for some domains that don't expect to crawl via
+heritrix):
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status IS NULL)
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json';
+ => 8784683
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json
+
+Seedlist transform from here on covered in MAG crawl notes.
+
+## Bulk Ingest
+
+Run ingest requests on everything we crawled:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Small sample:
+
+ head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+ cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Updated Overall Stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24574294
+ redirect-loop | 2633731
+ no-capture | 2458694
+ no-pdf-link | 1896871
+ link-loop | 1510899
+ terminal-bad-status | 878821
+ cdx-error | 387574
+ gateway-timeout | 200246
+ | 170304
+ wayback-error | 97572
+ spn2-cdx-lookup-failure | 80284
+ wrong-mimetype | 65097
+ invalid-host-resolution | 37204
+ petabox-error | 12097
+ null-body | 8549
+ spn2-error | 1706
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ (20 rows)
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+----------
+ success | 24557382
+ redirect-loop | 2630582
+ no-capture | 1947066
+ no-pdf-link | 1778206
+ link-loop | 1510790
+ terminal-bad-status | 857173
+ cdx-error | 384525
+ gateway-timeout | 200143
+ wayback-error | 96390
+ spn2-cdx-lookup-failure | 80010
+ wrong-mimetype | 64908
+ invalid-host-resolution | 37203
+ petabox-error | 12087
+ null-body | 8548
+ spn2-error | 1698
+ spn2-error:job-failed | 775
+ spn2-error:invalid-url-syntax | 335
+ spn2-error:soft-time-limit-exceeded | 191
+ bad-redirect | 90
+ | 69
+ (20 rows)
+
+Just the new seeds:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.created > '2020-06-20'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+---------
+ success | 5860601
+ no-capture | 1489959
+ redirect-loop | 619121
+ no-pdf-link | 473703
+ terminal-bad-status | 234753
+ cdx-error | 231575
+ link-loop | 184093
+ wayback-error | 56068
+ wrong-mimetype | 14046
+ null-body | 2068
+ petabox-error | 1006
+ gateway-timeout | 506
+ spn2-cdx-lookup-failure | 99
+ invalid-host-resolution | 70
+ | 22
+ bad-redirect | 13
+ spn2-error | 7
+ timeout | 3
+ spn2-error:job-failed | 2
+ spn2-error:soft-time-limit-exceeded | 1
+ (20 rows)
+