author    Bryan Newbold <bnewbold@archive.org>  2020-04-06 20:01:15 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-04-06 20:01:15 -0700
commit    5dd9e8f6790de403376811a966496b8f612f192e (patch)
tree      aa7f3698edcbf8faa1fd1cd620aff44dbd80d33a /notes
parent    35ebee00c38436ea8c8a075689d987d80841255a (diff)
MAG 2020-03-04 ingest notes to date
Diffstat (limited to 'notes')
-rw-r--r--  notes/ingest/2020-03-04_mag.md  395
1 file changed, 395 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
new file mode 100644
index 0000000..a5624c2
--- /dev/null
+++ b/notes/ingest/2020-03-04_mag.md
@@ -0,0 +1,395 @@
+
+Rough plan:
+
+- run bulk and/or regular ingest requests for just those from AIT partners (200k?)
+- persist ingest requests (22 million or so)
+- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall)
+- crawl those which are no-capture
+
+
+## Generate Requests
+
+The newer version of the `mag_ingest_request.sh` script requires a venv with
+urlcanon installed.
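+
+A minimal setup sketch (assuming a standard Python 3 venv; exact package
+versions were not recorded here):
+
    python3 -m venv venv
    source venv/bin/activate
    pip install urlcanon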
+
+Starting with the 2020-01-23 MAG dump, will generate a full ingest request set
+(including DOI `ext_id` when available), with any dominant domains removed (e.g.,
+arxiv.org):
+
+ export LC_ALL=C
+ cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json
+ => previously 25.6M
+ => 25.6M 2:29:43 [2.85k/s]
+
+ export LC_ALL=C
+ zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json
+ => 4.3M 0:25:45 [2.78k/s]
+
+ export LC_ALL=C
+ cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id
+
+ zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l
+ => 6,504,907
+
+ zcat PaperUrls_mag_url_pmid.txt.gz | wc -l
+ => 4,369,832
+
+ cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l
+ => previously 15,707,405
+ => 15,702,581
+
+ cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l
+ => 0
+ URL encoding seems to be working
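+
+For reference, a rough illustration of the kind of canonicalization urlcanon
+provides (a sketch using the whatwg canonicalizer; the script's exact behavior
+may differ). A literal space in a path should come back percent-encoded,
+something like `http://example.com/a%20b.pdf`:
+
    python3 -c "import urlcanon; u = urlcanon.parse_url('http://EXAMPLE.com/a b.pdf'); urlcanon.whatwg(u); print(u)"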
+
+## Persist Ingest Requests
+
+First the PMID ingest requests, then the all/doi file. The reason for this
+ordering is that the all/doi file will have some rows with no DOI (and thus no
+`ext_id`), while the PMID file will not.
+
+ # small sample
+ head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ Worker: Counter({'total': 10, 'skip-result-fields': 10})
+ JSON lines pushed: Counter({'total': 10, 'pushed': 10})
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request -
+ => 4.3M 0:16:46 [4.27k/s]
+ Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0})
+ JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026})
+ => hit a bug on first attempt, which is why total/insert results don't match
+
+ cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request -
+ => 25.6M 2:21:54 [3.01k/s]
+ Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0})
+ JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559})
+
+
+## Crawl/Dupe Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+After just PMID links:
+
+ status | count
+ ---------------------+---------
+ | 3000115
+ success | 1126881
+ no-capture | 69459
+ terminal-bad-status | 30259
+ redirect-loop | 11656
+ no-pdf-link | 2836
+ wrong-mimetype | 1456
+ link-loop | 1259
+ wayback-error | 1232
+ cdx-error | 932
+ null-body | 85
+ petabox-error | 50
+ bad-redirect | 1
+ (13 rows)
+
+After all links:
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag';
+ => 25596563
+
+
+ status | count
+ ---------------------+----------
+ | 21130841
+ success | 3915682
+ no-capture | 391813
+ terminal-bad-status | 76488
+ redirect-loop | 44202
+ wrong-mimetype | 16418
+ no-pdf-link | 10995
+ wayback-error | 3679
+ cdx-error | 3414
+ link-loop | 2098
+ null-body | 709
+ petabox-error | 221
+ bad-gzip-encoding | 2
+ bad-redirect | 1
+ (14 rows)
+
+Somewhat more un-ingested than expected.
+
+Dump requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/mag_noingest_20200305.rows.json';
+ => COPY 21,130,841
+
+Transform and shuf:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz
+ => 21.1M 0:18:57 [18.6k/s]
+
+## Bulk Ingest Partner Output
+
+These are subsets of the full list from potential AIT-S partners; want to run
+these through the pipeline before the full batch. Duplication against the full
+batch should be minimal.
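+
+One way to spot-check that overlap, if needed (a sketch; not run as part of
+these notes, and it assumes both files share the same `base_url` field):
+
    # how many partner base_urls also appear in the big no-ingest batch?
    jq -r .base_url ingest_requests_mag-2020-01-23.cornell.json | sort -u > cornell_urls.txt
    zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | jq -r .base_url | sort -u -S 4G > noingest_urls.txt
    comm -12 cornell_urls.txt noingest_urls.txt | wc -l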
+
+Size:
+
+ bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l
+ 29007
+ bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json
+ 34265 ingest_requests_mag-2020-01-23.cornell.json
+
+Test ingest:
+
+ head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full ingests:
+
+ cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Bulk Ingest
+
+Shard it into batches of roughly 1 million:
+
+ cd /grande/snapshots/
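    # -n r/20 distributes lines round-robin into 20 roughly equal chunks (~1.06M lines each)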
+ zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json
+
+Add a single batch like:
+
+ cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+ partner ingests (see above)
+ => 2020-03-05 12:49: 118,396
+ 1056543 mag_noingest_20200305.ingest_request.split_00.json
+ => 2020-03-05 14:34: 1,055,224
+ => check on stats/ratios; filter by ingest update time?
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json
+ 1056542 mag_noingest_20200305.ingest_request.split_02.json
+ 1056542 mag_noingest_20200305.ingest_request.split_03.json
+ 1056542 mag_noingest_20200305.ingest_request.split_04.json
+ 1056542 mag_noingest_20200305.ingest_request.split_05.json
+ 1056542 mag_noingest_20200305.ingest_request.split_06.json
+ 1056542 mag_noingest_20200305.ingest_request.split_07.json
+ 1056542 mag_noingest_20200305.ingest_request.split_08.json
+ 1056542 mag_noingest_20200305.ingest_request.split_09.json
+ => 2020-03-05 18:04: 10,009,297
+ => 2020-03-06 16:53: 6,553,946
+ 1056542 mag_noingest_20200305.ingest_request.split_10.json
+ 1056542 mag_noingest_20200305.ingest_request.split_11.json
+ 1056542 mag_noingest_20200305.ingest_request.split_12.json
+ 1056542 mag_noingest_20200305.ingest_request.split_13.json
+ 1056542 mag_noingest_20200305.ingest_request.split_14.json
+ 1056542 mag_noingest_20200305.ingest_request.split_15.json
+ 1056542 mag_noingest_20200305.ingest_request.split_16.json
+ 1056542 mag_noingest_20200305.ingest_request.split_17.json
+ 1056542 mag_noingest_20200305.ingest_request.split_18.json
+ 1056542 mag_noingest_20200305.ingest_request.split_19.json
+ => 2020-03-06 16:59: 17,001,032
+
+Stats from bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ ---------------------+----------
+ no-capture | 12237193
+ success | 11991293
+ no-pdf-link | 521691
+ redirect-loop | 437192
+ terminal-bad-status | 231181
+ link-loop | 92633
+ cdx-error | 33631
+ wrong-mimetype | 28638
+ wayback-error | 19651
+ null-body | 2682
+ petabox-error | 727
+ | 47
+ bad-redirect | 44
+ bad-gzip-encoding | 7
+ (14 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.status != 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ --------------------------------------+---------------------+--------
+ dialnet.unirioja.es | redirect-loop | 240967
+ onlinelibrary.wiley.com | no-pdf-link | 147696
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639
+ iopscience.iop.org | terminal-bad-status | 69591
+ febs.onlinelibrary.wiley.com | no-pdf-link | 49874
+ www.researchgate.net | redirect-loop | 42859
+ journals.sagepub.com | no-pdf-link | 27448
+ papers.ssrn.com | redirect-loop | 27328
+ dialnet.unirioja.es | terminal-bad-status | 20320
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 20232
+ science.sciencemag.org | link-loop | 17811
+ espace.library.uq.edu.au | redirect-loop | 17185
+ bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301
+ anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746
+ www.tandfonline.com | no-pdf-link | 13303
+ aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070
+ link.springer.com | redirect-loop | 10594
+ www.redalyc.org:9081 | no-pdf-link | 10515
+ watermark.silverchair.com | terminal-bad-status | 9739
+ www.bmj.com | link-loop | 9389
+ www.repository.naturalis.nl | redirect-loop | 8213
+ bjp.rcpsych.org | link-loop | 8045
+ aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814
+ nph.onlinelibrary.wiley.com | no-pdf-link | 7801
+ iopscience.iop.org | redirect-loop | 7697
+ journals.tubitak.gov.tr | wrong-mimetype | 7159
+ www.biorxiv.org | wrong-mimetype | 7067
+ www.erudit.org | redirect-loop | 6819
+ besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254
+ (30 rows)
+
+Domains to follow up on (e.g., sandcrawler ingest tests/tweaks):
+- dialnet.unirioja.es | redirect-loop | 240967
+- www.researchgate.net | redirect-loop | 42859
+- www.redalyc.org:9081 | no-pdf-link | 10515
+- www.repository.naturalis.nl | redirect-loop | 8213
+- bjp.rcpsych.org | link-loop | 8045
+- journals.tubitak.gov.tr | wrong-mimetype | 7159
+- www.erudit.org | redirect-loop | 6819
+
+The dialnet.unirioja.es ones may be worth re-crawling via heritrix?
+
+Top uncrawled domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------+------------+--------
+ ieeexplore.ieee.org | no-capture | 957835
+ link.springer.com | no-capture | 394121
+ www.researchgate.net | no-capture | 376974
+ cyberleninka.ru | no-capture | 376012
+ iopscience.iop.org | no-capture | 348791
+ papers.ssrn.com | no-capture | 286860
+ dergipark.org.tr | no-capture | 217556
+ dialnet.unirioja.es | no-capture | 214398
+ academic.oup.com | no-capture | 212364
+ www.tandfonline.com | no-capture | 148940
+ journals.sagepub.com | no-capture | 144695
+ www.papersearch.net | no-capture | 138986
+ absimage.aps.org | no-capture | 111976
+ apps.dtic.mil | no-capture | 106984
+ www.cambridge.org | no-capture | 97533
+ www.bmj.com | no-capture | 92437
+ bioone.org | no-capture | 87573
+ science.sciencemag.org | no-capture | 75723
+ shodhganga.inflibnet.ac.in:8080 | no-capture | 75395
+ www.jstor.org | no-capture | 73230
+ works.bepress.com | no-capture | 68747
+ www.scielo.org.co | no-capture | 59650
+ hrcak.srce.hr | no-capture | 59332
+ muse.jhu.edu | no-capture | 57828
+ onlinelibrary.wiley.com | no-capture | 55621
+ www.jbc.org | no-capture | 54608
+ www.jstage.jst.go.jp | no-capture | 53631
+ www.redalyc.org | no-capture | 50406
+ lup.lub.lu.se | no-capture | 47469
+ www.dtic.mil | no-capture | 41820
+ (30 rows)
+
+## Heritrix Seedlist Generation
+
+Dump ingest requests, filtering out some domains we don't expect to crawl via
+heritrix:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ AND ingest_file_result.status = 'no-capture'
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json';
+ => COPY 11714199
+
+ # in sandcrawler pipenv
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
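+
+From there, a heritrix seedlist could be derived by pulling out unique URLs,
+e.g. (a sketch; this step is not recorded above, and the output path is
+hypothetical):
+
    jq -r .base_url /grande/snapshots/mag_nocapture_20200313.json | sort -u -S 4G > /grande/snapshots/mag_nocapture_20200313.seedlist.txt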