From 476fa2ff8c5e561287390505c17caf1888d6b9f4 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 28 Dec 2020 23:24:02 -0800 Subject: progress notes on OA DOI ingest (still running) --- notes/ingest/2020-09_oa_doi.md | 113 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 11 deletions(-) diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md index c8ed6e9..2b4cb57 100644 --- a/notes/ingest/2020-09_oa_doi.md +++ b/notes/ingest/2020-09_oa_doi.md @@ -158,7 +158,7 @@ What top domains for those `no-pdf-link` (or similar)? osf.io | terminal-bad-status | 2388 (30 rows) -Will look at link extraction for: +Should look at link extraction for: - scialert.net - utpjournals.press @@ -171,7 +171,7 @@ Will look at link extraction for: ## Re-Ingest -Going to re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: +Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: COPY ( SELECT row_to_json(ingest_request.*) @@ -196,23 +196,114 @@ Going to re-run ingest to handle `no-capture` cases, to extract the missing term cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 -After that, will re-crawl somewhat broadly: +TODO below here... + +Resuming progress on this in early December 2020. 
+ +Filtered requests to re-crawl: COPY ( - SELECT row_to_json(r) FROM ( - SELECT ingest_request.*, ingest_file_result.terminal_url as terminal_url + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result FROM ingest_request LEFT JOIN ingest_file_result - ON ingest_file_result.ingest_type = ingest_request.ingest_type - AND ingest_file_result.base_url = ingest_request.base_url + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type WHERE - ingest_request.ingest_type = 'pdf' - AND ingest_request.link_source = 'doi' + ingest_request.link_source = 'doi' AND (ingest_request.ingest_request_source = 'fatcat-ingest' OR ingest_request.ingest_request_source = 'fatcat-changelog') AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20') OR (ingest_file_result.updated >= '2020-10-11')) AND ingest_file_result.status != 'success' - ) r - ) TO '/grande/snapshots/oa_doi_reingest_recrawl_20201014.rows.json'; + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + 
AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json'; + => COPY 2352614 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json + +And actually dump seedlist(s): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + + wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt + 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + +Top DOI prefixes (same old usual suspects): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20 + 353695 10.5281 zenodo.org + 121888 10.6084 figshare.com + 115093 10.3917 cairn.info + 113252 10.3406 persee.fr + 95414 10.1515 degruyter.com + 90448 10.4324 taylorfrancis.com + 83927 10.1016 elsevier + 60303 10.1109 IEEE + 48490 10.4000 openedition.org + 28498 10.3205 egms.de + 23433 10.1163 brill.com + 23276 10.17615 cdr.lib.unc.edu + 21386 10.1093 oup.com + 20783 10.3138 utpjournals.press + 19987 10.1201 tandfonline.com + 17916 10.34847 cocoon.huma-num.fr + 16970 10.1002 wiley.com + 15958 10.1097 lww.com (and 
others?) + 15835 10.1017 cambridge.org + 15466 10.24355 publikationsserver.tu-braunschweig.de (IR) + +Top domains (not doi.org): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 104148 zenodo.org + 85245 www.persee.fr + 52931 www.cairn.info + 4791 www.jstage.jst.go.jp + 4411 archive.monthlyreview.org + 4129 osf.io + 2841 www.indianjournals.com + 2746 www.impan.pl + 2620 platform.almanhal.com + 2019 www.nomos-elibrary.de + 1209 dergipark.org.tr + 1027 pubs.geoscienceworld.org + 973 www.pdcnet.org + 923 www.hanspub.org + 914 www.repository.cam.ac.uk + 863 mediarep.org + 812 www.cartographicperspectives.org + 687 www.degruyter.com + 578 192.168.7.24 + 566 journals.eco-vector.com + +TODO: infer `publisher_type` and platform from DOI prefix in more cases + +## Re-Ingest +TODO after crawl completion -- cgit v1.2.3