From 5dd8785d710cf7d067afdc691069bfa74406e06a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 May 2020 14:47:17 -0700 Subject: ingests: normalize file names; commit updates --- notes/ingest/2019-10-23_testing.md | 8 + notes/ingest/20191023_testing.md | 8 - notes/ingest/2020-01-14_bulk.md | 26 ++ notes/ingest/2020-02-14_unpaywall_ingest.md | 624 ---------------------------- notes/ingest/2020-02_unpaywall.md | 624 ++++++++++++++++++++++++++++ notes/ingest/2020-03-04_mag.md | 576 ------------------------- notes/ingest/2020-03-oa_but_not_marked.md | 25 ++ notes/ingest/2020-03_mag.md | 576 +++++++++++++++++++++++++ notes/ingest/2020-03_s2.md | 35 ++ notes/ingest/2020-03_s2_ingest.md | 35 -- notes/ingest/2020-04-07_datacite.md | 121 ------ notes/ingest/2020-04-07_unpaywall.md | 63 --- notes/ingest/2020-04_datacite.md | 121 ++++++ notes/ingest/2020-04_unpaywall.md | 129 ++++++ notes/ingest/2020-05_oai_pmh.md | 125 ++++++ notes/ingest/20200114_bulk_ingests.md | 26 -- 16 files changed, 1669 insertions(+), 1453 deletions(-) create mode 100644 notes/ingest/2019-10-23_testing.md delete mode 100644 notes/ingest/20191023_testing.md create mode 100644 notes/ingest/2020-01-14_bulk.md delete mode 100644 notes/ingest/2020-02-14_unpaywall_ingest.md create mode 100644 notes/ingest/2020-02_unpaywall.md delete mode 100644 notes/ingest/2020-03-04_mag.md create mode 100644 notes/ingest/2020-03-oa_but_not_marked.md create mode 100644 notes/ingest/2020-03_mag.md create mode 100644 notes/ingest/2020-03_s2.md delete mode 100644 notes/ingest/2020-03_s2_ingest.md delete mode 100644 notes/ingest/2020-04-07_datacite.md delete mode 100644 notes/ingest/2020-04-07_unpaywall.md create mode 100644 notes/ingest/2020-04_datacite.md create mode 100644 notes/ingest/2020-04_unpaywall.md create mode 100644 notes/ingest/2020-05_oai_pmh.md delete mode 100644 notes/ingest/20200114_bulk_ingests.md (limited to 'notes') diff --git a/notes/ingest/2019-10-23_testing.md b/notes/ingest/2019-10-23_testing.md new file mode 100644 index 0000000..481c4e2 --- /dev/null +++ b/notes/ingest/2019-10-23_testing.md @@ -0,0 +1,8 @@ + +exported not-archived DOIs for elife, as well as general list. + + wc -l recent\ missing\ oa\ releases.csv + 161828 recent missing oa releases.csv + + wc -l missing\ elife\ DOIs.csv + 1779 missing elife DOIs.csv diff --git a/notes/ingest/20191023_testing.md b/notes/ingest/20191023_testing.md deleted file mode 100644 index 481c4e2..0000000 --- a/notes/ingest/20191023_testing.md +++ /dev/null @@ -1,8 +0,0 @@ - -exported not-archived DOIs for elife, as well as general list. 
- - wc -l recent\ missing\ oa\ releases.csv - 161828 recent missing oa releases.csv - - wc -l missing\ elife\ DOIs.csv - 1779 missing elife DOIs.csv diff --git a/notes/ingest/2020-01-14_bulk.md b/notes/ingest/2020-01-14_bulk.md new file mode 100644 index 0000000..9d05cda --- /dev/null +++ b/notes/ingest/2020-01-14_bulk.md @@ -0,0 +1,26 @@ + +Generate ingest requests from arabesque: + + zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json + + zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json + + +Quick tests locally: + + time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json + time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json + +These are all wayback success; looking good! Single threaded, from home laptop +(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even +with 30x parallelism. Should re-test on actual server. GROBID pre-check should +help? + +With new bulk topic: + + head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Ok, let them rip: + + cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02-14_unpaywall_ingest.md deleted file mode 100644 index e18a2ff..0000000 --- a/notes/ingest/2020-02-14_unpaywall_ingest.md +++ /dev/null @@ -1,624 +0,0 @@ - -## Stats and Things - - zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt - -## Transform - - zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null - => 22M 1:31:25 [ 4k/s] - -Shard it into batches of roughly 1 million (all are 1098096 +/- 1): - - zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json - -Test ingest: - - head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Add a single batch like: - - cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Progress/Status - -There are 21,961,928 lines total, in batches of 1,098,097. 
- - unpaywall_snapshot_2019-11-22.ingest_request.split_00.json - => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined) - => 2020-02-25 10:35 local: 0 - unpaywall_snapshot_2019-11-22.ingest_request.split_01.json - unpaywall_snapshot_2019-11-22.ingest_request.split_02.json - unpaywall_snapshot_2019-11-22.ingest_request.split_03.json - unpaywall_snapshot_2019-11-22.ingest_request.split_04.json - => 2020-02-25 11:26 local: 4,388,997 - => 2020-02-25 10:14 local: 1,115,821 - => 2020-02-26 16:00 local: 265,116 - unpaywall_snapshot_2019-11-22.ingest_request.split_05.json - unpaywall_snapshot_2019-11-22.ingest_request.split_06.json - unpaywall_snapshot_2019-11-22.ingest_request.split_07.json - unpaywall_snapshot_2019-11-22.ingest_request.split_08.json - unpaywall_snapshot_2019-11-22.ingest_request.split_09.json - => 2020-02-26 16:01 local: 6,843,708 - => 2020-02-26 16:31 local: 4,839,618 - => 2020-02-28 10:30 local: 2,619,319 - unpaywall_snapshot_2019-11-22.ingest_request.split_10.json - unpaywall_snapshot_2019-11-22.ingest_request.split_11.json - unpaywall_snapshot_2019-11-22.ingest_request.split_12.json - unpaywall_snapshot_2019-11-22.ingest_request.split_13.json - unpaywall_snapshot_2019-11-22.ingest_request.split_14.json - unpaywall_snapshot_2019-11-22.ingest_request.split_15.json - unpaywall_snapshot_2019-11-22.ingest_request.split_16.json - unpaywall_snapshot_2019-11-22.ingest_request.split_17.json - unpaywall_snapshot_2019-11-22.ingest_request.split_18.json - unpaywall_snapshot_2019-11-22.ingest_request.split_19.json - => 2020-02-28 10:50 local: 13,551,887 - => 2020-03-01 23:38 local: 4,521,076 - => 2020-03-02 10:45 local: 2,827,071 - => 2020-03-02 21:06 local: 1,257,176 - added about 500k bulk re-ingest to try and work around cdx errors - => 2020-03-02 21:30 local: 1,733,654 - -## Investigate Failures - -Guessing than some domains are ultimately going to need direct "recrawl" via -SPNv2. 
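Before slicing failures by domain, individual failing rows can be spot-checked through the regular (non-bulk) ingest path, which should fall back to live SPNv2 fetching. A minimal sketch, assuming the same `./ingest_file.py requests -` JSON-lines interface used for the arabesque batches, with a placeholder URL standing in for a real failing row, and assuming the tool emits one JSON result object per request (hence the `jq .status`):

    echo '{"ingest_type": "pdf", "link_source": "unpaywall", "base_url": "https://journals.sagepub.com/doi/pdf/10.1177/EXAMPLE"}' \
        | ./ingest_file.py requests - | jq .status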
- - -- top domain failures for unpaywall GWB history ingest - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - -----------------------------------+---------------------+-------- - watermark.silverchair.com | terminal-bad-status | 258432 - www.tandfonline.com | no-pdf-link | 203873 - journals.sagepub.com | no-pdf-link | 126317 - iopscience.iop.org | terminal-bad-status | 112526 - files-journal-api.frontiersin.org | terminal-bad-status | 112499 - pubs.acs.org | no-pdf-link | 94772 - www.degruyter.com | redirect-loop | 89801 - www.ahajournals.org | no-pdf-link | 84025 - society.kisti.re.kr | no-pdf-link | 72849 - www.nature.com | redirect-loop | 53575 - babel.hathitrust.org | terminal-bad-status | 41063 - www.ncbi.nlm.nih.gov | redirect-loop | 40363 - scialert.net | no-pdf-link | 38340 - www.degruyter.com | terminal-bad-status | 34913 - www.journal.csj.jp | no-pdf-link | 30881 - espace.library.uq.edu.au | redirect-loop | 24570 - www.jci.org | redirect-loop | 24409 - aip.scitation.org | wrong-mimetype | 22144 - www.vr-elibrary.de | no-pdf-link | 17436 - www.biorxiv.org | wrong-mimetype | 15524 - ajph.aphapublications.org | no-pdf-link | 15083 - zookeys.pensoft.net | redirect-loop | 14867 - dialnet.unirioja.es | redirect-loop | 14486 - asa.scitation.org | wrong-mimetype | 14261 - www.nrcresearchpress.com | no-pdf-link | 14254 - dl.acm.org | redirect-loop | 14223 - osf.io | redirect-loop | 14103 - www.oecd-ilibrary.org | redirect-loop | 12835 - journals.sagepub.com | redirect-loop | 12229 - iopscience.iop.org | redirect-loop | 11825 - (30 rows) - - -- top no-capture terminal domains - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - => very few from any domain, interesting. 
Guess many of these are URLs that have truely never been crawled - - -- top no-capture base domains - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ------------------------------+------------+-------- - academic.oup.com | no-capture | 429888 - www.nature.com | no-capture | 273825 - dergipark.org.tr | no-capture | 119847 - www.biodiversitylibrary.org | no-capture | 110220 - escholarship.org | no-capture | 106307 - onlinelibrary.wiley.com | no-capture | 89771 - journals.sagepub.com | no-capture | 79297 - www.cell.com | no-capture | 64242 - deepblue.lib.umich.edu | no-capture | 58080 - babel.hathitrust.org | no-capture | 52286 - hal.archives-ouvertes.fr | no-capture | 48549 - iopscience.iop.org | no-capture | 42591 - dash.harvard.edu | no-capture | 40767 - www.tandfonline.com | no-capture | 40638 - discovery.ucl.ac.uk | no-capture | 40633 - www.jstage.jst.go.jp | no-capture | 39780 - www.doiserbia.nb.rs | no-capture | 39261 - dspace.mit.edu | no-capture | 37703 - zookeys.pensoft.net | no-capture | 34562 - repositorio.unesp.br | no-capture | 34437 - ashpublications.org | no-capture | 34112 - www.cambridge.org | no-capture | 33959 - kclpure.kcl.ac.uk | no-capture | 31455 - society.kisti.re.kr | no-capture | 30427 - pure.mpg.de | no-capture | 27650 - download.atlantis-press.com | no-capture | 27253 - dialnet.unirioja.es | no-capture | 26886 - link.springer.com | no-capture | 26257 - www.valueinhealthjournal.com | no-capture | 24798 - dspace.library.uu.nl | no-capture | 23234 - (30 rows) - - -- top no-capture base domains - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ------------------------------+------------+-------- - academic.oup.com | no-capture | 429888 - www.nature.com | no-capture | 273825 - dergipark.org.tr | no-capture | 119847 - www.biodiversitylibrary.org | no-capture | 110220 - escholarship.org | no-capture | 106307 - onlinelibrary.wiley.com | no-capture | 89771 - journals.sagepub.com | no-capture | 79297 - www.cell.com | no-capture | 64242 - deepblue.lib.umich.edu | no-capture | 58080 - babel.hathitrust.org | no-capture | 52286 - hal.archives-ouvertes.fr | no-capture | 48549 - iopscience.iop.org | no-capture | 42591 - dash.harvard.edu | no-capture | 40767 - www.tandfonline.com | no-capture | 40638 - discovery.ucl.ac.uk | no-capture | 40633 - www.jstage.jst.go.jp | no-capture | 39780 - www.doiserbia.nb.rs | no-capture | 39261 - 
dspace.mit.edu | no-capture | 37703 - zookeys.pensoft.net | no-capture | 34562 - repositorio.unesp.br | no-capture | 34437 - ashpublications.org | no-capture | 34112 - www.cambridge.org | no-capture | 33959 - kclpure.kcl.ac.uk | no-capture | 31455 - society.kisti.re.kr | no-capture | 30427 - pure.mpg.de | no-capture | 27650 - download.atlantis-press.com | no-capture | 27253 - dialnet.unirioja.es | no-capture | 26886 - link.springer.com | no-capture | 26257 - www.valueinhealthjournal.com | no-capture | 24798 - dspace.library.uu.nl | no-capture | 23234 - (30 rows) - - -- how many ingest requests not crawled at all? - SELECT count(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status IS NULL; - => 0 - - -- "cookie absent" terminal pages, by domain - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - --------------------------------+----------------+-------- - journals.sagepub.com | no-pdf-link | 126295 - www.tandfonline.com | no-pdf-link | 116690 - pubs.acs.org | no-pdf-link | 94619 - www.ahajournals.org | no-pdf-link | 84016 - www.journal.csj.jp | no-pdf-link | 30881 - aip.scitation.org | wrong-mimetype | 22143 - www.vr-elibrary.de | no-pdf-link | 17436 - ajph.aphapublications.org | no-pdf-link | 15080 - asa.scitation.org | wrong-mimetype | 14261 - www.nrcresearchpress.com | no-pdf-link | 14253 - journals.ametsoc.org | no-pdf-link | 10500 - www.journals.uchicago.edu | no-pdf-link | 6917 - www.icevirtuallibrary.com | no-pdf-link | 6484 - www.journals.uchicago.edu | wrong-mimetype | 6191 - www.healthaffairs.org | no-pdf-link | 5732 - pubsonline.informs.org | no-pdf-link | 5672 - pinnacle-secure.allenpress.com | no-pdf-link | 5013 - www.worldscientific.com | no-pdf-link | 4560 - www.ajronline.org | wrong-mimetype | 4523 - ehp.niehs.nih.gov | no-pdf-link | 4514 - www.future-science.com | no-pdf-link | 4091 - pubs.acs.org | wrong-mimetype | 4015 - aip.scitation.org | no-pdf-link | 3916 - www.futuremedicine.com | no-pdf-link | 3821 - asa.scitation.org | no-pdf-link | 3644 - www.liebertpub.com | no-pdf-link | 3345 - physicstoday.scitation.org | no-pdf-link | 3005 - pubs.cif-ifc.org | no-pdf-link | 2761 - epubs.siam.org | wrong-mimetype | 2583 - www.ajronline.org | no-pdf-link | 2563 - (30 rows) - - -- "cookie absent" terminal pages, by domain - SELECT count(*) - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND 
ingest_file_result.terminal_url LIKE '%/cookieAbsent'; - - => 654885 - - -- NOT "cookie absent" terminal page failures, total count - SELECT count(*) - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'; - - => 1403837 - -Looks like these domains are almost all "cookieAbsent" blocking: -- journals.sagepub.com -- pubs.acs.org -- ahajournals.org -- www.journal.csj.jp -- aip.scitation.org - -Grab some individual URLs to test: - - SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' - ORDER BY updated DESC - LIMIT 25; - -NOT cookieAbsent testing with regular ingest tool: -- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success -- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, succes -- osf.io success - - SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ORDER BY updated DESC - LIMIT 25; - -cookieAbsent testing with regular ingest tool: -- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works - -The main distinguisher is status. terminal-bad-status can be ingested (live) -successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled. - -## Heritrix Plan - -Generate following ingest request batches: - -- no-capture status from unpaywall -- all other failures except /cookieAbsent -- /cookieAbsent failures - -Plan will be to crawl no-capture first (to completion), then try the other -non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2. - -Because there are so few "no-capture on second hop" cases, will not enqueue -both terminal urls and base urls, only base urls. 
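For the Heritrix side, a rough sketch of pulling a seed list out of one of the request dumps (assuming the JSON exports generated in the next section, which carry a `base_url` field; the seeds filename here is just illustrative):

    cat unpaywall_nocapture_20200304.json | jq -r .base_url | sort -u > unpaywall_nocapture_20200304.seeds.txt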
- -Should definitely skip/filter: - -- www.ncbi.nlm.nih.gov - -## Ingest Request Export - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status = 'no-capture' - ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json'; - => 4,855,142 - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json'; - => 1,403,837 - - ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json - -Note: will probably end up re-running the below after crawling+ingesting the above: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.status = 'terminal-bad-status' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json'; - => 0 - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.status != 'terminal-bad-status' - AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json'; - => 654,885 - -## Batch Ingest - -Test small batch: - - head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Full batch: - - cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - - # there was a broken line in there, so... - # parse error: Expected separator between values at line 1367873, column 175 - # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null - tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Note that the crawl is not entirely complete and not all CDX seem to have been -loaded, so may need to iterate. About 10% are still "no capture". 
May want or -need to additionally crawl the terminal URLs, not the base URLs. - -## Post-ingest stats - -Overall status: - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - -------------------------+---------- - success | 17354494 - no-pdf-link | 1471076 - no-capture | 1135992 - redirect-loop | 837842 - terminal-bad-status | 803081 - cdx-error | 219746 - wrong-mimetype | 100723 - link-loop | 16013 - wayback-error | 12448 - null-body | 9444 - redirects-exceeded | 600 - petabox-error | 411 - bad-redirect | 17 - bad-gzip-encoding | 4 - spn2-cdx-lookup-failure | 3 - gateway-timeout | 1 - spn2-error:job-failed | 1 - spn2-error | 1 - (18 rows) - -Failures by domain: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - -----------------------------------+---------------------+-------- - academic.oup.com | no-pdf-link | 330211 - watermark.silverchair.com | terminal-bad-status | 324599 - www.tandfonline.com | no-pdf-link | 242724 - journals.sagepub.com | no-pdf-link | 202050 - iopscience.iop.org | terminal-bad-status | 144063 - files-journal-api.frontiersin.org | terminal-bad-status | 121719 - pubs.acs.org | no-pdf-link | 104535 - www.ahajournals.org | no-pdf-link | 102653 - society.kisti.re.kr | no-pdf-link | 101787 - www.degruyter.com | redirect-loop | 95130 - www.nature.com | redirect-loop | 87534 - onlinelibrary.wiley.com | no-pdf-link | 84432 - www.cell.com | redirect-loop | 61496 - www.degruyter.com | terminal-bad-status | 42919 - babel.hathitrust.org | terminal-bad-status | 41813 - www.ncbi.nlm.nih.gov | redirect-loop | 40488 - scialert.net | no-pdf-link | 38341 - ashpublications.org | no-pdf-link | 34889 - dialnet.unirioja.es | terminal-bad-status | 32076 - www.journal.csj.jp | no-pdf-link | 30881 - pure.mpg.de | redirect-loop | 26163 - www.jci.org | redirect-loop | 24701 - espace.library.uq.edu.au | redirect-loop | 24591 - www.valueinhealthjournal.com | redirect-loop | 23740 - www.vr-elibrary.de | no-pdf-link | 23332 - aip.scitation.org | wrong-mimetype | 22144 - osf.io | redirect-loop | 18513 - www.journals.elsevier.com | no-pdf-link | 16710 - www.spandidos-publications.com | redirect-loop | 15711 - www.biorxiv.org | wrong-mimetype | 15513 - (30 rows) - -Dump lists for another iteration of bulk ingest: - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status = 
'no-capture' - ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json'; - => 278,876 - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND ingest_file_result.status != 'success' - AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' - ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json'; - => - - - ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json - - cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - diff --git a/notes/ingest/2020-02_unpaywall.md b/notes/ingest/2020-02_unpaywall.md new file mode 100644 index 0000000..e18a2ff --- /dev/null +++ b/notes/ingest/2020-02_unpaywall.md @@ -0,0 +1,624 @@ + +## Stats and Things + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt + +## Transform + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null + => 22M 1:31:25 [ 4k/s] + +Shard it into batches of roughly 1 million (all are 1098096 +/- 1): + + zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json + +Test ingest: + + head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Add a single batch like: + + cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Progress/Status + +There are 21,961,928 lines total, in batches of 1,098,097. 
+ + unpaywall_snapshot_2019-11-22.ingest_request.split_00.json + => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined) + => 2020-02-25 10:35 local: 0 + unpaywall_snapshot_2019-11-22.ingest_request.split_01.json + unpaywall_snapshot_2019-11-22.ingest_request.split_02.json + unpaywall_snapshot_2019-11-22.ingest_request.split_03.json + unpaywall_snapshot_2019-11-22.ingest_request.split_04.json + => 2020-02-25 11:26 local: 4,388,997 + => 2020-02-25 10:14 local: 1,115,821 + => 2020-02-26 16:00 local: 265,116 + unpaywall_snapshot_2019-11-22.ingest_request.split_05.json + unpaywall_snapshot_2019-11-22.ingest_request.split_06.json + unpaywall_snapshot_2019-11-22.ingest_request.split_07.json + unpaywall_snapshot_2019-11-22.ingest_request.split_08.json + unpaywall_snapshot_2019-11-22.ingest_request.split_09.json + => 2020-02-26 16:01 local: 6,843,708 + => 2020-02-26 16:31 local: 4,839,618 + => 2020-02-28 10:30 local: 2,619,319 + unpaywall_snapshot_2019-11-22.ingest_request.split_10.json + unpaywall_snapshot_2019-11-22.ingest_request.split_11.json + unpaywall_snapshot_2019-11-22.ingest_request.split_12.json + unpaywall_snapshot_2019-11-22.ingest_request.split_13.json + unpaywall_snapshot_2019-11-22.ingest_request.split_14.json + unpaywall_snapshot_2019-11-22.ingest_request.split_15.json + unpaywall_snapshot_2019-11-22.ingest_request.split_16.json + unpaywall_snapshot_2019-11-22.ingest_request.split_17.json + unpaywall_snapshot_2019-11-22.ingest_request.split_18.json + unpaywall_snapshot_2019-11-22.ingest_request.split_19.json + => 2020-02-28 10:50 local: 13,551,887 + => 2020-03-01 23:38 local: 4,521,076 + => 2020-03-02 10:45 local: 2,827,071 + => 2020-03-02 21:06 local: 1,257,176 + added about 500k bulk re-ingest to try and work around cdx errors + => 2020-03-02 21:30 local: 1,733,654 + +## Investigate Failures + +Guessing than some domains are ultimately going to need direct "recrawl" via +SPNv2. 
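Before slicing failures by domain, individual failing rows can be spot-checked through the regular (non-bulk) ingest path, which should fall back to live SPNv2 fetching. A minimal sketch, assuming the same `./ingest_file.py requests -` JSON-lines interface used for the arabesque batches, with a placeholder URL standing in for a real failing row, and assuming the tool emits one JSON result object per request (hence the `jq .status`):

    echo '{"ingest_type": "pdf", "link_source": "unpaywall", "base_url": "https://journals.sagepub.com/doi/pdf/10.1177/EXAMPLE"}' \
        | ./ingest_file.py requests - | jq .status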
+ + -- top domain failures for unpaywall GWB history ingest + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + watermark.silverchair.com | terminal-bad-status | 258432 + www.tandfonline.com | no-pdf-link | 203873 + journals.sagepub.com | no-pdf-link | 126317 + iopscience.iop.org | terminal-bad-status | 112526 + files-journal-api.frontiersin.org | terminal-bad-status | 112499 + pubs.acs.org | no-pdf-link | 94772 + www.degruyter.com | redirect-loop | 89801 + www.ahajournals.org | no-pdf-link | 84025 + society.kisti.re.kr | no-pdf-link | 72849 + www.nature.com | redirect-loop | 53575 + babel.hathitrust.org | terminal-bad-status | 41063 + www.ncbi.nlm.nih.gov | redirect-loop | 40363 + scialert.net | no-pdf-link | 38340 + www.degruyter.com | terminal-bad-status | 34913 + www.journal.csj.jp | no-pdf-link | 30881 + espace.library.uq.edu.au | redirect-loop | 24570 + www.jci.org | redirect-loop | 24409 + aip.scitation.org | wrong-mimetype | 22144 + www.vr-elibrary.de | no-pdf-link | 17436 + www.biorxiv.org | wrong-mimetype | 15524 + ajph.aphapublications.org | no-pdf-link | 15083 + zookeys.pensoft.net | redirect-loop | 14867 + dialnet.unirioja.es | redirect-loop | 14486 + asa.scitation.org | wrong-mimetype | 14261 + www.nrcresearchpress.com | no-pdf-link | 14254 + dl.acm.org | redirect-loop | 14223 + osf.io | redirect-loop | 14103 + www.oecd-ilibrary.org | redirect-loop | 12835 + journals.sagepub.com | redirect-loop | 12229 + iopscience.iop.org | redirect-loop | 11825 + (30 rows) + + -- top no-capture terminal domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + => very few from any domain, interesting. 
Guess many of these are URLs that have truely never been crawled + + -- top no-capture base domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+------------+-------- + academic.oup.com | no-capture | 429888 + www.nature.com | no-capture | 273825 + dergipark.org.tr | no-capture | 119847 + www.biodiversitylibrary.org | no-capture | 110220 + escholarship.org | no-capture | 106307 + onlinelibrary.wiley.com | no-capture | 89771 + journals.sagepub.com | no-capture | 79297 + www.cell.com | no-capture | 64242 + deepblue.lib.umich.edu | no-capture | 58080 + babel.hathitrust.org | no-capture | 52286 + hal.archives-ouvertes.fr | no-capture | 48549 + iopscience.iop.org | no-capture | 42591 + dash.harvard.edu | no-capture | 40767 + www.tandfonline.com | no-capture | 40638 + discovery.ucl.ac.uk | no-capture | 40633 + www.jstage.jst.go.jp | no-capture | 39780 + www.doiserbia.nb.rs | no-capture | 39261 + dspace.mit.edu | no-capture | 37703 + zookeys.pensoft.net | no-capture | 34562 + repositorio.unesp.br | no-capture | 34437 + ashpublications.org | no-capture | 34112 + www.cambridge.org | no-capture | 33959 + kclpure.kcl.ac.uk | no-capture | 31455 + society.kisti.re.kr | no-capture | 30427 + pure.mpg.de | no-capture | 27650 + download.atlantis-press.com | no-capture | 27253 + dialnet.unirioja.es | no-capture | 26886 + link.springer.com | no-capture | 26257 + www.valueinhealthjournal.com | no-capture | 24798 + dspace.library.uu.nl | no-capture | 23234 + (30 rows) + + -- top no-capture base domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+------------+-------- + academic.oup.com | no-capture | 429888 + www.nature.com | no-capture | 273825 + dergipark.org.tr | no-capture | 119847 + www.biodiversitylibrary.org | no-capture | 110220 + escholarship.org | no-capture | 106307 + onlinelibrary.wiley.com | no-capture | 89771 + journals.sagepub.com | no-capture | 79297 + www.cell.com | no-capture | 64242 + deepblue.lib.umich.edu | no-capture | 58080 + babel.hathitrust.org | no-capture | 52286 + hal.archives-ouvertes.fr | no-capture | 48549 + iopscience.iop.org | no-capture | 42591 + dash.harvard.edu | no-capture | 40767 + www.tandfonline.com | no-capture | 40638 + discovery.ucl.ac.uk | no-capture | 40633 + www.jstage.jst.go.jp | no-capture | 39780 + www.doiserbia.nb.rs | no-capture | 39261 + 
dspace.mit.edu | no-capture | 37703 + zookeys.pensoft.net | no-capture | 34562 + repositorio.unesp.br | no-capture | 34437 + ashpublications.org | no-capture | 34112 + www.cambridge.org | no-capture | 33959 + kclpure.kcl.ac.uk | no-capture | 31455 + society.kisti.re.kr | no-capture | 30427 + pure.mpg.de | no-capture | 27650 + download.atlantis-press.com | no-capture | 27253 + dialnet.unirioja.es | no-capture | 26886 + link.springer.com | no-capture | 26257 + www.valueinhealthjournal.com | no-capture | 24798 + dspace.library.uu.nl | no-capture | 23234 + (30 rows) + + -- how many ingest requests not crawled at all? + SELECT count(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status IS NULL; + => 0 + + -- "cookie absent" terminal pages, by domain + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + --------------------------------+----------------+-------- + journals.sagepub.com | no-pdf-link | 126295 + www.tandfonline.com | no-pdf-link | 116690 + pubs.acs.org | no-pdf-link | 94619 + www.ahajournals.org | no-pdf-link | 84016 + www.journal.csj.jp | no-pdf-link | 30881 + aip.scitation.org | wrong-mimetype | 22143 + www.vr-elibrary.de | no-pdf-link | 17436 + ajph.aphapublications.org | no-pdf-link | 15080 + asa.scitation.org | wrong-mimetype | 14261 + www.nrcresearchpress.com | no-pdf-link | 14253 + journals.ametsoc.org | no-pdf-link | 10500 + www.journals.uchicago.edu | no-pdf-link | 6917 + www.icevirtuallibrary.com | no-pdf-link | 6484 + www.journals.uchicago.edu | wrong-mimetype | 6191 + www.healthaffairs.org | no-pdf-link | 5732 + pubsonline.informs.org | no-pdf-link | 5672 + pinnacle-secure.allenpress.com | no-pdf-link | 5013 + www.worldscientific.com | no-pdf-link | 4560 + www.ajronline.org | wrong-mimetype | 4523 + ehp.niehs.nih.gov | no-pdf-link | 4514 + www.future-science.com | no-pdf-link | 4091 + pubs.acs.org | wrong-mimetype | 4015 + aip.scitation.org | no-pdf-link | 3916 + www.futuremedicine.com | no-pdf-link | 3821 + asa.scitation.org | no-pdf-link | 3644 + www.liebertpub.com | no-pdf-link | 3345 + physicstoday.scitation.org | no-pdf-link | 3005 + pubs.cif-ifc.org | no-pdf-link | 2761 + epubs.siam.org | wrong-mimetype | 2583 + www.ajronline.org | no-pdf-link | 2563 + (30 rows) + + -- "cookie absent" terminal pages, by domain + SELECT count(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND 
ingest_file_result.terminal_url LIKE '%/cookieAbsent'; + + => 654885 + + -- NOT "cookie absent" terminal page failures, total count + SELECT count(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'; + + => 1403837 + +Looks like these domains are almost all "cookieAbsent" blocking: +- journals.sagepub.com +- pubs.acs.org +- ahajournals.org +- www.journal.csj.jp +- aip.scitation.org + +Grab some individual URLs to test: + + SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ORDER BY updated DESC + LIMIT 25; + +NOT cookieAbsent testing with regular ingest tool: +- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success +- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, succes +- osf.io success + + SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ORDER BY updated DESC + LIMIT 25; + +cookieAbsent testing with regular ingest tool: +- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works + +The main distinguisher is status. terminal-bad-status can be ingested (live) +successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled. + +## Heritrix Plan + +Generate following ingest request batches: + +- no-capture status from unpaywall +- all other failures except /cookieAbsent +- /cookieAbsent failures + +Plan will be to crawl no-capture first (to completion), then try the other +non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2. + +Because there are so few "no-capture on second hop" cases, will not enqueue +both terminal urls and base urls, only base urls. 
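For the Heritrix side, a rough sketch of pulling a seed list out of one of the request dumps (assuming the JSON exports generated in the next section, which carry a `base_url` field; the seeds filename here is just illustrative):

    cat unpaywall_nocapture_20200304.json | jq -r .base_url | sort -u > unpaywall_nocapture_20200304.seeds.txt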
+ +Should definitely skip/filter: + +- www.ncbi.nlm.nih.gov + +## Ingest Request Export + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json'; + => 4,855,142 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json'; + => 1,403,837 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json + +Note: will probably end up re-running the below after crawling+ingesting the above: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.status = 'terminal-bad-status' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json'; + => 0 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.status != 'terminal-bad-status' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json'; + => 654,885 + +## Batch Ingest + +Test small batch: + + head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full batch: + + cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + # there was a broken line in there, so... + # parse error: Expected separator between values at line 1367873, column 175 + # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null + tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Note that the crawl is not entirely complete and not all CDX seem to have been +loaded, so may need to iterate. About 10% are still "no capture". 
May want or +need to additionally crawl the terminal URLs, not the base URLs. + +## Post-ingest stats + +Overall status: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 17354494 + no-pdf-link | 1471076 + no-capture | 1135992 + redirect-loop | 837842 + terminal-bad-status | 803081 + cdx-error | 219746 + wrong-mimetype | 100723 + link-loop | 16013 + wayback-error | 12448 + null-body | 9444 + redirects-exceeded | 600 + petabox-error | 411 + bad-redirect | 17 + bad-gzip-encoding | 4 + spn2-cdx-lookup-failure | 3 + gateway-timeout | 1 + spn2-error:job-failed | 1 + spn2-error | 1 + (18 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + academic.oup.com | no-pdf-link | 330211 + watermark.silverchair.com | terminal-bad-status | 324599 + www.tandfonline.com | no-pdf-link | 242724 + journals.sagepub.com | no-pdf-link | 202050 + iopscience.iop.org | terminal-bad-status | 144063 + files-journal-api.frontiersin.org | terminal-bad-status | 121719 + pubs.acs.org | no-pdf-link | 104535 + www.ahajournals.org | no-pdf-link | 102653 + society.kisti.re.kr | no-pdf-link | 101787 + www.degruyter.com | redirect-loop | 95130 + www.nature.com | redirect-loop | 87534 + onlinelibrary.wiley.com | no-pdf-link | 84432 + www.cell.com | redirect-loop | 61496 + www.degruyter.com | terminal-bad-status | 42919 + babel.hathitrust.org | terminal-bad-status | 41813 + www.ncbi.nlm.nih.gov | redirect-loop | 40488 + scialert.net | no-pdf-link | 38341 + ashpublications.org | no-pdf-link | 34889 + dialnet.unirioja.es | terminal-bad-status | 32076 + www.journal.csj.jp | no-pdf-link | 30881 + pure.mpg.de | redirect-loop | 26163 + www.jci.org | redirect-loop | 24701 + espace.library.uq.edu.au | redirect-loop | 24591 + www.valueinhealthjournal.com | redirect-loop | 23740 + www.vr-elibrary.de | no-pdf-link | 23332 + aip.scitation.org | wrong-mimetype | 22144 + osf.io | redirect-loop | 18513 + www.journals.elsevier.com | no-pdf-link | 16710 + www.spandidos-publications.com | redirect-loop | 15711 + www.biorxiv.org | wrong-mimetype | 15513 + (30 rows) + +Dump lists for another iteration of bulk ingest: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status = 
'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json'; + => 278,876 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json'; + => + + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json + + cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md deleted file mode 100644 index 428ce05..0000000 --- a/notes/ingest/2020-03-04_mag.md +++ /dev/null @@ -1,576 +0,0 @@ - -Rough plan: - -- run bulk and/or regular ingest requests for just those of AIT partners (200k?) -- persist ingest requests (22 million or so) -- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall) -- crawl those which are no-capture - - -## Generate Requests - -Newer version of `mag_ingest_request.sh` script requires venv with urlcanon -installed. - -Starting with the 2020-01-23 MAG dump, will generate a full ingest request set -(including DOI `ext_id` when available), with any dominant domains removed (eg, -arxiv.org): - - export LC_ALL=C - cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json - => previously 25.6M - => 25.6M 2:29:43 [2.85k/s] - - export LC_ALL=C - zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json - => 4.3M 0:25:45 [2.78k/s] - - export LC_ALL=C - cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id - - zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l - => 6,504,907 - - zcat PaperUrls_mag_url_pmid.txt.gz | wc -l - => 4,369,832 - - cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l - => previously 15,707,405 - => 15,702,581 - - cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l - => 0 - URL encoding seems to be working - -## Persist Ingest Requests - -First pmid ingest requests, then the all/doi file. The reason to do this order -is that the all/doi file will have some rows with no DOI (and thus no -`ext_id`), while the PMID file will not. 
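As a quick sanity check on that, a sketch of counting how many rows in the all/doi file have no DOI at all (and so will dedupe purely on `base_url`), re-using the same jq/rg pattern as above:

    cat ingest_requests_mag-2020-01-23.doi.json | jq .ext_ids.doi -r | rg -a -c '^null$'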
- - # small sample - head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - - Worker: Counter({'total': 10, 'skip-result-fields': 10}) - JSON lines pushed: Counter({'total': 10, 'pushed': 10}) - - cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - - => 4.3M 0:16:46 [4.27k/s] - Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0}) - JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026}) - => hit a bug on first attempt, which is why total/insert results don't match - - cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request - - => 25.6M 2:21:54 [3.01k/s] - Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0}) - JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559}) - - -## Crawl/Dupe Status - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - -After just PMID links: - - status | count - ---------------------+--------- - | 3000115 - success | 1126881 - no-capture | 69459 - terminal-bad-status | 30259 - redirect-loop | 11656 - no-pdf-link | 2836 - wrong-mimetype | 1456 - link-loop | 1259 - wayback-error | 1232 - cdx-error | 932 - null-body | 85 - petabox-error | 50 - bad-redirect | 1 - (13 rows) - -After all links: - - SELECT COUNT(*) - FROM ingest_request - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag'; - => 25596563 - - - status | count - ---------------------+---------- - | 21130841 - success | 3915682 - no-capture | 391813 - terminal-bad-status | 76488 - redirect-loop | 44202 - wrong-mimetype | 16418 - no-pdf-link | 10995 - wayback-error | 3679 - cdx-error | 3414 - link-loop | 2098 - null-body | 709 - petabox-error | 221 - bad-gzip-encoding | 2 - bad-redirect | 1 - (14 rows) - -Somewhat more un-ingested than expected. - -Dump requests: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/mag_noingest_20200305.rows.json'; - => COPY 21,130,841 - -Transform and shuf: - - ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz - => 21.1M 0:18:57 [18.6k/s] - -## Bulk Ingest Partner Output - -These are subsets of the full list from potential AIT-S partners; want to run -these through the pipeline before the full batch. Duplication against the full -batch should be minimal. 
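One way to sanity-check that claim (a sketch; `cornell` stands in for any of the partner subsets listed below) is to count how many of a partner file's base URLs also show up in the big no-ingest dump:

    export LC_ALL=C
    comm -12 \
        <(jq -r .base_url ingest_requests_mag-2020-01-23.cornell.json | sort -u) \
        <(zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | jq -r .base_url | sort -u) \
        | wc -l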
- -Size: - - bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l - 29007 - bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json - 34265 ingest_requests_mag-2020-01-23.cornell.json - -Test ingest: - - head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Full ingests: - - cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Bulk Ingest - -Shard it into batches of roughly 1 million: - - cd /grande/snapshots/ - zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json - -Add a single batch like: - - cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - - partner ingests (see above) - => 2020-03-05 12:49: 118,396 - 1056543 mag_noingest_20200305.ingest_request.split_00.json - => 2020-03-05 14:34: 1,055,224 - => check on stats/ratios; filter by ingest update time? 
- 1056542 mag_noingest_20200305.ingest_request.split_01.json - 1056542 mag_noingest_20200305.ingest_request.split_02.json - 1056542 mag_noingest_20200305.ingest_request.split_03.json - 1056542 mag_noingest_20200305.ingest_request.split_04.json - 1056542 mag_noingest_20200305.ingest_request.split_05.json - 1056542 mag_noingest_20200305.ingest_request.split_06.json - 1056542 mag_noingest_20200305.ingest_request.split_07.json - 1056542 mag_noingest_20200305.ingest_request.split_08.json - 1056542 mag_noingest_20200305.ingest_request.split_09.json - => 2020-03-05 18:04: 10,009,297 - => 2020-03-06 16:53: 6,553,946 - 1056542 mag_noingest_20200305.ingest_request.split_10.json - 1056542 mag_noingest_20200305.ingest_request.split_11.json - 1056542 mag_noingest_20200305.ingest_request.split_12.json - 1056542 mag_noingest_20200305.ingest_request.split_13.json - 1056542 mag_noingest_20200305.ingest_request.split_14.json - 1056542 mag_noingest_20200305.ingest_request.split_15.json - 1056542 mag_noingest_20200305.ingest_request.split_16.json - 1056542 mag_noingest_20200305.ingest_request.split_17.json - 1056542 mag_noingest_20200305.ingest_request.split_18.json - 1056542 mag_noingest_20200305.ingest_request.split_19.json - => 2020-03-06 16:59: 17,001,032 - -Stats from bulk ingest: - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - ---------------------+---------- - no-capture | 12237193 - success | 11991293 - no-pdf-link | 521691 - redirect-loop | 437192 - terminal-bad-status | 231181 - link-loop | 92633 - cdx-error | 33631 - wrong-mimetype | 28638 - wayback-error | 19651 - null-body | 2682 - petabox-error | 727 - | 47 - bad-redirect | 44 - bad-gzip-encoding | 7 - (14 rows) - -Failures by domain: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - AND t1.status != 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - --------------------------------------+---------------------+-------- - dialnet.unirioja.es | redirect-loop | 240967 - onlinelibrary.wiley.com | no-pdf-link | 147696 - agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 - iopscience.iop.org | terminal-bad-status | 69591 - febs.onlinelibrary.wiley.com | no-pdf-link | 49874 - www.researchgate.net | redirect-loop | 42859 - journals.sagepub.com | no-pdf-link | 27448 - papers.ssrn.com | redirect-loop | 27328 - dialnet.unirioja.es | terminal-bad-status | 20320 - physoc.onlinelibrary.wiley.com | no-pdf-link | 20232 - science.sciencemag.org | link-loop | 17811 - espace.library.uq.edu.au | redirect-loop | 17185 - bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785 - obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301 - anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746 - 
www.tandfonline.com | no-pdf-link | 13303 - aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070 - link.springer.com | redirect-loop | 10594 - www.redalyc.org:9081 | no-pdf-link | 10515 - watermark.silverchair.com | terminal-bad-status | 9739 - www.bmj.com | link-loop | 9389 - www.repository.naturalis.nl | redirect-loop | 8213 - bjp.rcpsych.org | link-loop | 8045 - aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814 - nph.onlinelibrary.wiley.com | no-pdf-link | 7801 - iopscience.iop.org | redirect-loop | 7697 - journals.tubitak.gov.tr | wrong-mimetype | 7159 - www.biorxiv.org | wrong-mimetype | 7067 - www.erudit.org | redirect-loop | 6819 - besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254 - (30 rows) - -Domains to follow-up (eg, sandcrawler ingest tests/tweaks): -- dialnet.unirioja.es | redirect-loop | 240967 -- www.researchgate.net | redirect-loop | 42859 -- www.redalyc.org:9081 | no-pdf-link | 10515 -- www.repository.naturalis.nl | redirect-loop | 8213 -- bjp.rcpsych.org | link-loop | 8045 -- journals.tubitak.gov.tr | wrong-mimetype | 7159 -- www.erudit.org | redirect-loop | 6819 - -The dialnet.unirioja.es ones may be worth re-crawling via heritrix? - -Top uncrawled domains: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - ) t1 - WHERE t1.domain != '' - AND t1.status = 'no-capture' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ---------------------------------+------------+-------- - ieeexplore.ieee.org | no-capture | 957835 - link.springer.com | no-capture | 394121 - www.researchgate.net | no-capture | 376974 - cyberleninka.ru | no-capture | 376012 - iopscience.iop.org | no-capture | 348791 - papers.ssrn.com | no-capture | 286860 - dergipark.org.tr | no-capture | 217556 - dialnet.unirioja.es | no-capture | 214398 - academic.oup.com | no-capture | 212364 - www.tandfonline.com | no-capture | 148940 - journals.sagepub.com | no-capture | 144695 - www.papersearch.net | no-capture | 138986 - absimage.aps.org | no-capture | 111976 - apps.dtic.mil | no-capture | 106984 - www.cambridge.org | no-capture | 97533 - www.bmj.com | no-capture | 92437 - bioone.org | no-capture | 87573 - science.sciencemag.org | no-capture | 75723 - shodhganga.inflibnet.ac.in:8080 | no-capture | 75395 - www.jstor.org | no-capture | 73230 - works.bepress.com | no-capture | 68747 - www.scielo.org.co | no-capture | 59650 - hrcak.srce.hr | no-capture | 59332 - muse.jhu.edu | no-capture | 57828 - onlinelibrary.wiley.com | no-capture | 55621 - www.jbc.org | no-capture | 54608 - www.jstage.jst.go.jp | no-capture | 53631 - www.redalyc.org | no-capture | 50406 - lup.lub.lu.se | no-capture | 47469 - www.dtic.mil | no-capture | 41820 - (30 rows) - -## Heritrix Seedlist Generation - -Dump ingest requests (filtered for some domains that don't expect to crawl via -heritrix): - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND 
ingest_request.link_source = 'mag' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json'; - => COPY 11714199 - - # in sandcrawler pipenv - ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json - -## Bulk Ingest of Heritrix Content - -Small sample: - - head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Full run: - - cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - - 2020-04-07 12:19 (pacific): 11,703,871 - -## Post-bulk-ingest - -Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need -to re-try things like cdx-error. - -Current status: - - status | count - -------------------------------+---------- - success | 18491799 - redirect-loop | 1968530 - no-capture | 1373657 - no-pdf-link | 1311842 - link-loop | 1296439 - terminal-bad-status | 627577 - cdx-error | 418278 - wrong-mimetype | 50141 - wayback-error | 37159 - petabox-error | 11249 - null-body | 6295 - gateway-timeout | 3051 - spn2-cdx-lookup-failure | 328 - spn2-error:invalid-url-syntax | 93 - bad-redirect | 75 - | 47 - invalid-host-resolution | 28 - spn2-error | 10 - bad-gzip-encoding | 7 - redirects-exceeded | 2 - (20 rows) - -Lots of cdx-error to retry. - -The no-capture links are probably a mix of domain-blocklist and things that -failed in bulk mode. Will dump and re-attempt them: - - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; - => 859849 - -What domains are these? - - cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 - -Let's filter down more: - - cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json - - wc -l mag_nocapture_20200420.rows.filtered.json - 423085 mag_nocapture_20200420.rows.filtered.json - -Ok, enqueue! - - cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 - -## Final Stats - -... for this round of ingest: - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - - status | count - -------------------------------------+---------- - success | 18712849 - redirect-loop | 2008110 - no-pdf-link | 1337012 - link-loop | 1326761 - no-capture | 1030693 - terminal-bad-status | 637143 - gateway-timeout | 193194 - cdx-error | 125907 - spn2-cdx-lookup-failure | 77842 - wrong-mimetype | 50882 - wayback-error | 40278 - invalid-host-resolution | 35201 - petabox-error | 11254 - null-body | 6485 - spn2-error | 1643 - spn2-error:job-failed | 747 - spn2-error:invalid-url-syntax | 325 - spn2-error:soft-time-limit-exceeded | 190 - bad-redirect | 77 - | 47 - (20 rows) - -Failures by domain: - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_file_result.ingest_type = 'pdf' - AND ingest_request.link_source = 'mag' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - - domain | status | count - ---------------------------------+---------------------+-------- - ieeexplore.ieee.org | redirect-loop | 677712 - cyberleninka.ru | link-loop | 308390 - papers.ssrn.com | link-loop | 281804 - ieeexplore.ieee.org | link-loop | 273559 - dialnet.unirioja.es | redirect-loop | 240504 - dialnet.unirioja.es | terminal-bad-status | 232481 - onlinelibrary.wiley.com | no-pdf-link | 220932 - iopscience.iop.org | terminal-bad-status | 172480 - validate.perfdrive.com | no-pdf-link | 172312 - link.springer.com | redirect-loop | 130398 - agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 - iopscience.iop.org | redirect-loop | 105234 - www.bmj.com | link-loop | 100354 - www.researchgate.net | redirect-loop | 84366 - www.cambridge.org | link-loop | 83171 - jamanetwork.com | no-pdf-link | 75053 - febs.onlinelibrary.wiley.com | no-pdf-link | 74872 - www.jstor.org | redirect-loop | 72059 - journals.sagepub.com | no-pdf-link | 63028 - science.sciencemag.org | redirect-loop | 62927 - profile.thieme.de | no-pdf-link | 62406 - cyberleninka.ru | redirect-loop | 56733 - link.springer.com | link-loop | 47608 - physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 - science.sciencemag.org | link-loop | 29908 - papers.ssrn.com | redirect-loop | 27255 - obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 - www.computer.org | no-pdf-link | 26444 - watermark.silverchair.com | terminal-bad-status | 25934 - www.nature.com | redirect-loop | 25306 - (30 rows) diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md new file mode 100644 index 0000000..73396bd --- /dev/null +++ b/notes/ingest/2020-03-oa_but_not_marked.md @@ -0,0 +1,25 @@ + +These are large journals with a high fraction of "in IA", but not marked as OA +so not crawling regularly. 
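One way to surface more containers like these might be an aggregation over the
release search index: filter to releases preserved in IA but not flagged OA,
then group by container. Rough sketch only; the endpoint, index name, and the
`in_ia` / `is_oa` / `container_id` field names are assumptions, not verified
against the current schema:

    curl -s 'https://search.fatcat.wiki/fatcat_release/_search' \
        -H 'Content-Type: application/json' \
        -d '{
              "size": 0,
              "query": {"bool": {"filter": [
                {"term": {"in_ia": true}},
                {"term": {"is_oa": false}}
              ]}},
              "aggs": {"top_containers": {"terms": {"field": "container_id", "size": 50}}}
            }'

Bucket counts would still need to be compared against total releases per
container to get an actual "in IA" fraction.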
+ +TODO: add things like list of unpaywall ISSN / OA status to try and find more +"practical" / bronze OA + +## First Run + +https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him +https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4 +https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4 +https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e +https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm +https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe + +## TODO + +https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible) +https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?) + +https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link? +https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA? +https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken? +https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop diff --git a/notes/ingest/2020-03_mag.md b/notes/ingest/2020-03_mag.md new file mode 100644 index 0000000..428ce05 --- /dev/null +++ b/notes/ingest/2020-03_mag.md @@ -0,0 +1,576 @@ + +Rough plan: + +- run bulk and/or regular ingest requests for just those of AIT partners (200k?) +- persist ingest requests (22 million or so) +- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall) +- crawl those which are no-capture + + +## Generate Requests + +Newer version of `mag_ingest_request.sh` script requires venv with urlcanon +installed. + +Starting with the 2020-01-23 MAG dump, will generate a full ingest request set +(including DOI `ext_id` when available), with any dominant domains removed (eg, +arxiv.org): + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json + => previously 25.6M + => 25.6M 2:29:43 [2.85k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json + => 4.3M 0:25:45 [2.78k/s] + + export LC_ALL=C + cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id + + zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l + => 6,504,907 + + zcat PaperUrls_mag_url_pmid.txt.gz | wc -l + => 4,369,832 + + cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l + => previously 15,707,405 + => 15,702,581 + + cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l + => 0 + URL encoding seems to be working + +## Persist Ingest Requests + +First pmid ingest requests, then the all/doi file. The reason to do this order +is that the all/doi file will have some rows with no DOI (and thus no +`ext_id`), while the PMID file will not. 
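Quick sanity check on how many all/doi rows have no DOI at all (a sketch,
mirroring the non-null count above; if those counts refer to the same dump,
expect roughly 25.6M minus 15.7M, on the order of 9.9M rows):

    cat ingest_requests_mag-2020-01-23.doi.json | jq .ext_ids.doi -r | rg -a -c '^null$'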
+ + # small sample + head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 10, 'skip-result-fields': 10}) + JSON lines pushed: Counter({'total': 10, 'pushed': 10}) + + cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - + => 4.3M 0:16:46 [4.27k/s] + Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0}) + JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026}) + => hit a bug on first attempt, which is why total/insert results don't match + + cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request - + => 25.6M 2:21:54 [3.01k/s] + Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0}) + JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559}) + + +## Crawl/Dupe Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +After just PMID links: + + status | count + ---------------------+--------- + | 3000115 + success | 1126881 + no-capture | 69459 + terminal-bad-status | 30259 + redirect-loop | 11656 + no-pdf-link | 2836 + wrong-mimetype | 1456 + link-loop | 1259 + wayback-error | 1232 + cdx-error | 932 + null-body | 85 + petabox-error | 50 + bad-redirect | 1 + (13 rows) + +After all links: + + SELECT COUNT(*) + FROM ingest_request + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag'; + => 25596563 + + + status | count + ---------------------+---------- + | 21130841 + success | 3915682 + no-capture | 391813 + terminal-bad-status | 76488 + redirect-loop | 44202 + wrong-mimetype | 16418 + no-pdf-link | 10995 + wayback-error | 3679 + cdx-error | 3414 + link-loop | 2098 + null-body | 709 + petabox-error | 221 + bad-gzip-encoding | 2 + bad-redirect | 1 + (14 rows) + +Somewhat more un-ingested than expected. + +Dump requests: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/mag_noingest_20200305.rows.json'; + => COPY 21,130,841 + +Transform and shuf: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz + => 21.1M 0:18:57 [18.6k/s] + +## Bulk Ingest Partner Output + +These are subsets of the full list from potential AIT-S partners; want to run +these through the pipeline before the full batch. Duplication against the full +batch should be minimal. 
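To quantify the overlap between one of the partner files (listed below) and
the full no-ingest dump generated above, something like this sketch could be
used:

    jq .base_url -r ingest_requests_mag-2020-01-23.cornell.json | sort -u > cornell.urls
    zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | jq .base_url -r | sort -u -S 4G > noingest.urls
    comm -12 cornell.urls noingest.urls | wc -l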
+ +Size: + + bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l + 29007 + bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json + 34265 ingest_requests_mag-2020-01-23.cornell.json + +Test ingest: + + head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full ingests: + + cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Bulk Ingest + +Shard it into batches of roughly 1 million: + + cd /grande/snapshots/ + zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json + +Add a single batch like: + + cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + partner ingests (see above) + => 2020-03-05 12:49: 118,396 + 1056543 mag_noingest_20200305.ingest_request.split_00.json + => 2020-03-05 14:34: 1,055,224 + => check on stats/ratios; filter by ingest update time? 
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json + 1056542 mag_noingest_20200305.ingest_request.split_02.json + 1056542 mag_noingest_20200305.ingest_request.split_03.json + 1056542 mag_noingest_20200305.ingest_request.split_04.json + 1056542 mag_noingest_20200305.ingest_request.split_05.json + 1056542 mag_noingest_20200305.ingest_request.split_06.json + 1056542 mag_noingest_20200305.ingest_request.split_07.json + 1056542 mag_noingest_20200305.ingest_request.split_08.json + 1056542 mag_noingest_20200305.ingest_request.split_09.json + => 2020-03-05 18:04: 10,009,297 + => 2020-03-06 16:53: 6,553,946 + 1056542 mag_noingest_20200305.ingest_request.split_10.json + 1056542 mag_noingest_20200305.ingest_request.split_11.json + 1056542 mag_noingest_20200305.ingest_request.split_12.json + 1056542 mag_noingest_20200305.ingest_request.split_13.json + 1056542 mag_noingest_20200305.ingest_request.split_14.json + 1056542 mag_noingest_20200305.ingest_request.split_15.json + 1056542 mag_noingest_20200305.ingest_request.split_16.json + 1056542 mag_noingest_20200305.ingest_request.split_17.json + 1056542 mag_noingest_20200305.ingest_request.split_18.json + 1056542 mag_noingest_20200305.ingest_request.split_19.json + => 2020-03-06 16:59: 17,001,032 + +Stats from bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + ---------------------+---------- + no-capture | 12237193 + success | 11991293 + no-pdf-link | 521691 + redirect-loop | 437192 + terminal-bad-status | 231181 + link-loop | 92633 + cdx-error | 33631 + wrong-mimetype | 28638 + wayback-error | 19651 + null-body | 2682 + petabox-error | 727 + | 47 + bad-redirect | 44 + bad-gzip-encoding | 7 + (14 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + --------------------------------------+---------------------+-------- + dialnet.unirioja.es | redirect-loop | 240967 + onlinelibrary.wiley.com | no-pdf-link | 147696 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + iopscience.iop.org | terminal-bad-status | 69591 + febs.onlinelibrary.wiley.com | no-pdf-link | 49874 + www.researchgate.net | redirect-loop | 42859 + journals.sagepub.com | no-pdf-link | 27448 + papers.ssrn.com | redirect-loop | 27328 + dialnet.unirioja.es | terminal-bad-status | 20320 + physoc.onlinelibrary.wiley.com | no-pdf-link | 20232 + science.sciencemag.org | link-loop | 17811 + espace.library.uq.edu.au | redirect-loop | 17185 + bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301 + anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746 + 
www.tandfonline.com | no-pdf-link | 13303 + aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070 + link.springer.com | redirect-loop | 10594 + www.redalyc.org:9081 | no-pdf-link | 10515 + watermark.silverchair.com | terminal-bad-status | 9739 + www.bmj.com | link-loop | 9389 + www.repository.naturalis.nl | redirect-loop | 8213 + bjp.rcpsych.org | link-loop | 8045 + aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814 + nph.onlinelibrary.wiley.com | no-pdf-link | 7801 + iopscience.iop.org | redirect-loop | 7697 + journals.tubitak.gov.tr | wrong-mimetype | 7159 + www.biorxiv.org | wrong-mimetype | 7067 + www.erudit.org | redirect-loop | 6819 + besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254 + (30 rows) + +Domains to follow-up (eg, sandcrawler ingest tests/tweaks): +- dialnet.unirioja.es | redirect-loop | 240967 +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 + +The dialnet.unirioja.es ones may be worth re-crawling via heritrix? + +Top uncrawled domains: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------+------------+-------- + ieeexplore.ieee.org | no-capture | 957835 + link.springer.com | no-capture | 394121 + www.researchgate.net | no-capture | 376974 + cyberleninka.ru | no-capture | 376012 + iopscience.iop.org | no-capture | 348791 + papers.ssrn.com | no-capture | 286860 + dergipark.org.tr | no-capture | 217556 + dialnet.unirioja.es | no-capture | 214398 + academic.oup.com | no-capture | 212364 + www.tandfonline.com | no-capture | 148940 + journals.sagepub.com | no-capture | 144695 + www.papersearch.net | no-capture | 138986 + absimage.aps.org | no-capture | 111976 + apps.dtic.mil | no-capture | 106984 + www.cambridge.org | no-capture | 97533 + www.bmj.com | no-capture | 92437 + bioone.org | no-capture | 87573 + science.sciencemag.org | no-capture | 75723 + shodhganga.inflibnet.ac.in:8080 | no-capture | 75395 + www.jstor.org | no-capture | 73230 + works.bepress.com | no-capture | 68747 + www.scielo.org.co | no-capture | 59650 + hrcak.srce.hr | no-capture | 59332 + muse.jhu.edu | no-capture | 57828 + onlinelibrary.wiley.com | no-capture | 55621 + www.jbc.org | no-capture | 54608 + www.jstage.jst.go.jp | no-capture | 53631 + www.redalyc.org | no-capture | 50406 + lup.lub.lu.se | no-capture | 47469 + www.dtic.mil | no-capture | 41820 + (30 rows) + +## Heritrix Seedlist Generation + +Dump ingest requests (filtered for some domains that don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND 
ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json'; + => COPY 11714199 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json + +## Bulk Ingest of Heritrix Content + +Small sample: + + head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full run: + + cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + 2020-04-07 12:19 (pacific): 11,703,871 + +## Post-bulk-ingest + +Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need +to re-try things like cdx-error. + +Current status: + + status | count + -------------------------------+---------- + success | 18491799 + redirect-loop | 1968530 + no-capture | 1373657 + no-pdf-link | 1311842 + link-loop | 1296439 + terminal-bad-status | 627577 + cdx-error | 418278 + wrong-mimetype | 50141 + wayback-error | 37159 + petabox-error | 11249 + null-body | 6295 + gateway-timeout | 3051 + spn2-cdx-lookup-failure | 328 + spn2-error:invalid-url-syntax | 93 + bad-redirect | 75 + | 47 + invalid-host-resolution | 28 + spn2-error | 10 + bad-gzip-encoding | 7 + redirects-exceeded | 2 + (20 rows) + +Lots of cdx-error to retry. + +The no-capture links are probably a mix of domain-blocklist and things that +failed in bulk mode. Will dump and re-attempt them: + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; + => 859849 + +What domains are these? + + cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 + +Let's filter down more: + + cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json + + wc -l mag_nocapture_20200420.rows.filtered.json + 423085 mag_nocapture_20200420.rows.filtered.json + +Ok, enqueue! + + cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Final Stats + +... for this round of ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+---------- + success | 18712849 + redirect-loop | 2008110 + no-pdf-link | 1337012 + link-loop | 1326761 + no-capture | 1030693 + terminal-bad-status | 637143 + gateway-timeout | 193194 + cdx-error | 125907 + spn2-cdx-lookup-failure | 77842 + wrong-mimetype | 50882 + wayback-error | 40278 + invalid-host-resolution | 35201 + petabox-error | 11254 + null-body | 6485 + spn2-error | 1643 + spn2-error:job-failed | 747 + spn2-error:invalid-url-syntax | 325 + spn2-error:soft-time-limit-exceeded | 190 + bad-redirect | 77 + | 47 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + domain | status | count + ---------------------------------+---------------------+-------- + ieeexplore.ieee.org | redirect-loop | 677712 + cyberleninka.ru | link-loop | 308390 + papers.ssrn.com | link-loop | 281804 + ieeexplore.ieee.org | link-loop | 273559 + dialnet.unirioja.es | redirect-loop | 240504 + dialnet.unirioja.es | terminal-bad-status | 232481 + onlinelibrary.wiley.com | no-pdf-link | 220932 + iopscience.iop.org | terminal-bad-status | 172480 + validate.perfdrive.com | no-pdf-link | 172312 + link.springer.com | redirect-loop | 130398 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 + iopscience.iop.org | redirect-loop | 105234 + www.bmj.com | link-loop | 100354 + www.researchgate.net | redirect-loop | 84366 + www.cambridge.org | link-loop | 83171 + jamanetwork.com | no-pdf-link | 75053 + febs.onlinelibrary.wiley.com | no-pdf-link | 74872 + www.jstor.org | redirect-loop | 72059 + journals.sagepub.com | no-pdf-link | 63028 + science.sciencemag.org | redirect-loop | 62927 + profile.thieme.de | no-pdf-link | 62406 + cyberleninka.ru | redirect-loop | 56733 + link.springer.com | link-loop | 47608 + physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 + science.sciencemag.org | link-loop | 29908 + papers.ssrn.com | redirect-loop | 27255 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 + www.computer.org | no-pdf-link | 26444 + watermark.silverchair.com | terminal-bad-status | 25934 + www.nature.com | redirect-loop | 25306 + (30 rows) diff --git a/notes/ingest/2020-03_s2.md b/notes/ingest/2020-03_s2.md new file mode 100644 index 0000000..fedaba0 --- /dev/null +++ b/notes/ingest/2020-03_s2.md @@ -0,0 +1,35 @@ + +Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these +ingested, as well as any previous existing content. 
+ +Also, there are a bunch of PDF outlinks to the web; should do S2-specific +matching and ingest of those. + +There are a few categories of paper from pdfs.s.o: + +1. we had previous GWB crawl, didn't re-crawl +2. we had PDF from elsewhere on the web, didn't re-crawl +3. crawled successfully +4. crawl failed + +In this ingest, want to get all of categories 1 and 3. Could try to do this by +dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), +and join that against the ingest request list. + +For other random web URLs, can do the usual persist/backfill/recrawl pipeline. + +## Create Seedlist + + zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz + zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz + + zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list + zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list + + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz + + zcat s2_external_ingestrequest.json.gz | wc -l + 41201427 + zcat s2_hosted_ingestrequest.json.gz | wc -l + 23345761 diff --git a/notes/ingest/2020-03_s2_ingest.md b/notes/ingest/2020-03_s2_ingest.md deleted file mode 100644 index fedaba0..0000000 --- a/notes/ingest/2020-03_s2_ingest.md +++ /dev/null @@ -1,35 +0,0 @@ - -Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these -ingested, as well as any previous existing content. - -Also, there are a bunch of PDF outlinks to the web; should do S2-specific -matching and ingest of those. - -There are a few categories of paper from pdfs.s.o: - -1. we had previous GWB crawl, didn't re-crawl -2. we had PDF from elsewhere on the web, didn't re-crawl -3. crawled successfully -4. crawl failed - -In this ingest, want to get all of categories 1 and 3. Could try to do this by -dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), -and join that against the ingest request list. - -For other random web URLs, can do the usual persist/backfill/recrawl pipeline. 
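A sketch of that "dump CDX and join" idea, assuming the sandcrawler postgres
has a `cdx` table with a `url` column (database, table, and column names here
are assumptions), joined against the `s2_hosted_ingestrequest.json.gz` file
from the seedlist step:

    psql sandcrawler -c "COPY (
        SELECT url FROM cdx
        WHERE url LIKE 'http%://pdfs.semanticscholar.org/%'
    ) TO STDOUT" | sort -u -S 4G > pdfs_s2_cdx_urls.txt

    zcat s2_hosted_ingestrequest.json.gz | jq .base_url -r | sort -u -S 4G > s2_hosted.urls
    comm -12 pdfs_s2_cdx_urls.txt s2_hosted.urls | wc -l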
- -## Create Seedlist - - zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz - zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz - - zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list - zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list - - zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz - zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz - - zcat s2_external_ingestrequest.json.gz | wc -l - 41201427 - zcat s2_hosted_ingestrequest.json.gz | wc -l - 23345761 diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04-07_datacite.md deleted file mode 100644 index 0fc7e67..0000000 --- a/notes/ingest/2020-04-07_datacite.md +++ /dev/null @@ -1,121 +0,0 @@ - -After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many -of the DOIs are for, eg, datasets, and don't want to waste time on those. - -Instead of using full ingest request file from the crawl, will generate a new -ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. - -## Generate Requests - - ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json - => Expecting 8905453 release objects in search queries - => 8.91M 11:49:50 [ 209 /s] - => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) - -## Bulk Ingest - - cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Ingest Stats - -Note that this will have a small fraction of non-datacite results mixed in (eg, -from COVID-19 targeted crawls): - - SELECT ingest_file_result.status, COUNT(*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'doi' - AND ingest_request.ingest_type = 'pdf' - AND ingest_request.ingest_request_source = 'fatcat-ingest' - AND created >= '2020-04-07' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; - - status | count - -------------------------------------+--------- - no-pdf-link | 4646767 - redirect-loop | 1447229 - no-capture | 860235 - success | 849501 - terminal-bad-status | 174869 - cdx-error | 159805 - wayback-error | 18076 - wrong-mimetype | 11169 - link-loop | 8410 - gateway-timeout | 4034 - spn2-cdx-lookup-failure | 510 - petabox-error | 339 - null-body | 251 - spn2-error | 19 - spn2-error:job-failed | 14 - bad-gzip-encoding | 13 - timeout | 5 - spn2-error:soft-time-limit-exceeded | 4 - invalid-host-resolution | 2 - spn2-error:pending | 1 - (20 rows) - -Top domains/statuses (including success): - - SELECT domain, status, COUNT((domain, status)) - FROM ( - SELECT - ingest_file_result.ingest_type, - ingest_file_result.status, - substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain - FROM ingest_file_result - LEFT JOIN ingest_request - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'doi' - AND ingest_request.ingest_type = 'pdf' - AND ingest_request.ingest_request_source = 'fatcat-ingest' - AND created >= '2020-04-07' - ) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 30; - - domain | status | count - ---------------------------------------+---------------------+-------- - ssl.fao.org | no-pdf-link | 862277 - www.e-periodica.ch | no-pdf-link | 746781 - www.researchgate.net | redirect-loop | 664524 - dlc.library.columbia.edu | no-pdf-link | 493111 - www.die-bonn.de | redirect-loop | 352903 - figshare.com | no-pdf-link | 319709 - statisticaldatasets.data-planet.com | no-pdf-link | 309584 - catalog.paradisec.org.au | redirect-loop | 225396 - zenodo.org | no-capture | 193201 - digi.ub.uni-heidelberg.de | no-pdf-link | 184974 - open.library.ubc.ca | no-pdf-link | 167841 - zenodo.org | no-pdf-link | 130617 - www.google.com | no-pdf-link | 111312 - www.e-manuscripta.ch | no-pdf-link | 79192 - ds.iris.edu | no-pdf-link | 77649 - data.inra.fr | no-pdf-link | 69440 - www.tib.eu | no-pdf-link | 63872 - www.egms.de | redirect-loop | 53877 - archaeologydataservice.ac.uk | redirect-loop | 52838 - d.lib.msu.edu | no-pdf-link | 45297 - www.e-rara.ch | no-pdf-link | 45163 - springernature.figshare.com | no-pdf-link | 42527 - boris.unibe.ch | no-pdf-link | 40816 - www.research-collection.ethz.ch | no-capture | 40350 - spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 - repository.dri.ie | terminal-bad-status | 32760 - othes.univie.ac.at | no-pdf-link | 32558 - repositories.lib.utexas.edu | no-capture | 31526 - posterng.netkey.at | no-pdf-link | 30315 - zenodo.org | terminal-bad-status | 29614 - (30 rows) - diff --git 
a/notes/ingest/2020-04-07_unpaywall.md b/notes/ingest/2020-04-07_unpaywall.md deleted file mode 100644 index e30d482..0000000 --- a/notes/ingest/2020-04-07_unpaywall.md +++ /dev/null @@ -1,63 +0,0 @@ - -A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but -not released for more than a month). - -Primary goal is: - -- generate ingest requests for only *new* URLs -- bulk ingest these new URLs -- crawl any no-capture URLs from that batch -- re-bulk-ingest the no-capture batch -- analytics on failed ingests. eg, any particular domains that are failing to crawl - -This ingest pipeline was started on 2020-04-07 by bnewbold. - -## Transform and Load - - # in sandcrawler pipenv on aitio - zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json - => 24.7M 5:17:03 [ 1.3k/s] - - cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - - => 24.7M - => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) - -## Dump new URLs and Bulk Ingest - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; - => 3696189 - - cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -## Dump no-capture - - COPY ( - SELECT row_to_json(ingest_request.*) - FROM ingest_request - LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url - WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'unpaywall' - AND date(ingest_request.created) > '2020-04-01' - AND ingest_file_result.status = 'no-capture' - AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' - AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' - AND ingest_request.base_url NOT LIKE '%ahajournals.org%' - AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' - AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' - AND ingest_request.base_url NOT LIKE '%academic.oup.com%' - AND ingest_request.base_url NOT LIKE '%tandfonline.com%' - ) TO '/grande/snapshots/unpaywall_nocapture_2020-04-XX.rows.json'; diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md new file mode 100644 index 0000000..0fc7e67 --- /dev/null +++ b/notes/ingest/2020-04_datacite.md @@ -0,0 +1,121 @@ + +After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many +of the DOIs are for, eg, datasets, and don't want to waste time on those. + +Instead of using full ingest request file from the crawl, will generate a new +ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. 
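To get a feel for how much of the datacite DOI space is paper-like versus
datasets before generating requests, a release-type aggregation could help.
Sketch only (endpoint and the `release_type` field name are assumptions; the
`doi_registrar:datacite` query string is the same one used below):

    curl -s 'https://search.fatcat.wiki/fatcat_release/_search' \
        -H 'Content-Type: application/json' \
        -d '{
              "size": 0,
              "query": {"query_string": {"query": "doi_registrar:datacite"}},
              "aggs": {"types": {"terms": {"field": "release_type", "size": 30}}}
            }'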
+ +## Generate Requests + + ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json + => Expecting 8905453 release objects in search queries + => 8.91M 11:49:50 [ 209 /s] + => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) + +## Bulk Ingest + + cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Ingest Stats + +Note that this will have a small fraction of non-datacite results mixed in (eg, +from COVID-19 targeted crawls): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + no-pdf-link | 4646767 + redirect-loop | 1447229 + no-capture | 860235 + success | 849501 + terminal-bad-status | 174869 + cdx-error | 159805 + wayback-error | 18076 + wrong-mimetype | 11169 + link-loop | 8410 + gateway-timeout | 4034 + spn2-cdx-lookup-failure | 510 + petabox-error | 339 + null-body | 251 + spn2-error | 19 + spn2-error:job-failed | 14 + bad-gzip-encoding | 13 + timeout | 5 + spn2-error:soft-time-limit-exceeded | 4 + invalid-host-resolution | 2 + spn2-error:pending | 1 + (20 rows) + +Top domains/statuses (including success): + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------------+---------------------+-------- + ssl.fao.org | no-pdf-link | 862277 + www.e-periodica.ch | no-pdf-link | 746781 + www.researchgate.net | redirect-loop | 664524 + dlc.library.columbia.edu | no-pdf-link | 493111 + www.die-bonn.de | redirect-loop | 352903 + figshare.com | no-pdf-link | 319709 + statisticaldatasets.data-planet.com | no-pdf-link | 309584 + catalog.paradisec.org.au | redirect-loop | 225396 + zenodo.org | no-capture | 193201 + digi.ub.uni-heidelberg.de | no-pdf-link | 184974 + open.library.ubc.ca | no-pdf-link | 167841 + zenodo.org | no-pdf-link | 130617 + www.google.com | no-pdf-link | 111312 + www.e-manuscripta.ch | no-pdf-link | 79192 + ds.iris.edu | no-pdf-link | 77649 + data.inra.fr | no-pdf-link | 69440 + www.tib.eu | no-pdf-link | 63872 + www.egms.de | redirect-loop | 53877 + archaeologydataservice.ac.uk | redirect-loop | 52838 + 
d.lib.msu.edu | no-pdf-link | 45297 + www.e-rara.ch | no-pdf-link | 45163 + springernature.figshare.com | no-pdf-link | 42527 + boris.unibe.ch | no-pdf-link | 40816 + www.research-collection.ethz.ch | no-capture | 40350 + spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 + repository.dri.ie | terminal-bad-status | 32760 + othes.univie.ac.at | no-pdf-link | 32558 + repositories.lib.utexas.edu | no-capture | 31526 + posterng.netkey.at | no-pdf-link | 30315 + zenodo.org | terminal-bad-status | 29614 + (30 rows) + diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md new file mode 100644 index 0000000..bce757b --- /dev/null +++ b/notes/ingest/2020-04_unpaywall.md @@ -0,0 +1,129 @@ + +A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but +not released for more than a month). + +Primary goal is: + +- generate ingest requests for only *new* URLs +- bulk ingest these new URLs +- crawl any no-capture URLs from that batch +- re-bulk-ingest the no-capture batch +- analytics on failed ingests. eg, any particular domains that are failing to crawl + +This ingest pipeline was started on 2020-04-07 by bnewbold. + +Ran through the first two steps again on 2020-05-03 after unpaywall had +released another dump (dated 2020-04-27). + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json + => 24.7M 5:17:03 [ 1.3k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 24.7M + => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) + +Second time: + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json + => 25.2M 3:16:28 [2.14k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390}) + + +## Dump new URLs and Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; + => 3696189 + + cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Second time: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json'; + => 1799760 + + cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Dump no-capture, Run Crawl + +Make two ingest request dumps: one with "all" URLs, which we will have heritrix +attempt to crawl, and then one with certain domains filtered out, which we may +or may not bother trying to ingest (due to expectation of failure). + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json'; + => 2734145 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + => 2602408 + +Not actually a very significant size difference after all. + +See `journal-crawls` repo for details on seedlist generation and crawling. + +## Re-Ingest Post-Crawl + +Test small batch: + + zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Run the whole batch: + + zcat /grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md new file mode 100644 index 0000000..4cfd8d5 --- /dev/null +++ b/notes/ingest/2020-05_oai_pmh.md @@ -0,0 +1,125 @@ + +Primary Goal: start large crawl of OAI landing pages that we haven't seen + +Fields of interest for ingest: +- oai identifer +- doi +- formats +- urls (maybe also "relations") +- types (type+stage) + +## Other Tasks + +About 150 million total lines. 
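To eyeball the record shape and re-check that total (a sketch):

    zstdcat oai.ndjson.zst | head -n1 | jq .
    zstdcat oai.ndjson.zst | pv -l | wc -l
    => about 150M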
+ +Types coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt + +Dump all ISSNs, with counts, quick check how many are in chocula/fatcat + + zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt + +Language coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt + +Format coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt + => 150M 0:56:14 [44.7k/s] + +Have a DOI? + + zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l + => 16,013,503 + + zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt + => 11,940,950 + +## Transform, Load, Bulk Ingest + + zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz + => 80M 6:36:55 [3.36k/s] + + time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request - + => 80M 4:00:21 [5.55k/s] + => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963}) + + => real 240m21.207s + => user 85m12.576s + => sys 3m29.580s + + select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai'; + => 51,185,088 + +Why so many (30 million) skipped? Not unique? + + zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l + => 51,185,088 + + zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt + wc -l request_url.txt + => 50,002,674 request_url.txt + + zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt + wc -l requires_oai.txt + => 34,622,083 requires_oai.txt + +Yup, tons of duplication. And remember this is exact URL, not SURT or similar. + +How many of these are URLs we have seen and ingested already? 
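+
+The status query below answers that directly. As a side note, a tighter
+duplication estimate than exact-URL matching could come from normalizing to
+SURT form first; a rough sketch, assuming the `surt` python package is
+available in the pipenv (malformed URLs may need filtering first):
+
+    zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | python3 -c 'import sys; from surt import surt; [print(surt(l.strip())) for l in sys.stdin]' | sort -u -S 4G | wc -l
+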
+ + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + | 49491452 + success | 1469113 + no-capture | 134611 + redirect-loop | 59666 + no-pdf-link | 8947 + cdx-error | 7561 + terminal-bad-status | 6704 + null-body | 5042 + wrong-mimetype | 879 + wayback-error | 722 + petabox-error | 198 + gateway-timeout | 86 + link-loop | 51 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + spn2-error | 4 + bad-gzip-encoding | 4 + spn2-error:job-failed | 2 + (18 rows) + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/oai_noingest_20200506.requests.json'; + => COPY 49491452 + + cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/20200114_bulk_ingests.md b/notes/ingest/20200114_bulk_ingests.md deleted file mode 100644 index 9d05cda..0000000 --- a/notes/ingest/20200114_bulk_ingests.md +++ /dev/null @@ -1,26 +0,0 @@ - -Generate ingest requests from arabesque: - - zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json - - zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json - - -Quick tests locally: - - time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json - time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json - -These are all wayback success; looking good! Single threaded, from home laptop -(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even -with 30x parallelism. Should re-test on actual server. GROBID pre-check should -help? - -With new bulk topic: - - head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - -Ok, let them rip: - - cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 - cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 -- cgit v1.2.3