commit 2114cce448c5ff0424f667ba5298010722965d73
tree   610570f0adce8b392e79d57cf7e65778151d04a0
parent c32b13949a06e9e834c1fac40e3609cb8eeb9f31
Author: Bryan Newbold <bnewbold@archive.org>
Date:   2022-07-14 14:39:30 -0700

    unpaywall crawl wrap-up notes
Diffstat (limited to 'notes'):

 notes/ingest/2022-04_unpaywall.md | 147 ++++++++++++++++++++++++++++++++++-
 1 file changed, 145 insertions(+), 2 deletions(-)
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
index 600b2d6..bc78998 100644
--- a/notes/ingest/2022-04_unpaywall.md
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -5,6 +5,10 @@
 This will probably be the last unpaywall crawl? Will switch to openalex in
 the future, because we can automate that ingest process, and run it on our
 own schedule.
 
+    export SNAPSHOT=2022-03-09
+    export CRAWLVM=wbgrp-svc279.us.archive.org
+    export CRAWLNAME=UNPAYWALL-CRAWL-2022-04
+
 ## Download and Archive
 
     wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
 
@@ -76,6 +80,59 @@ Only the recent bulk ingest:
     ORDER BY COUNT DESC
     LIMIT 20;
 
+    status                   |  count
+    -------------------------+---------
+    no-capture               | 3330232
+    success                  | 2455102
+    redirect-loop            |  197117
+    terminal-bad-status      |   82618
+    no-pdf-link              |   33046
+    blocked-cookie           |   16078
+    link-loop                |    6745
+    wrong-mimetype           |    3416
+    wayback-error            |    1385
+    empty-blob               |    1142
+    cdx-error                |     820
+    body-too-large           |     292
+    bad-gzip-encoding        |     281
+    wayback-content-error    |     267
+                             |     253
+    petabox-error            |     215
+    skip-url-blocklist       |     185
+    null-body                |     179
+    spn2-cdx-lookup-failure  |      89
+    gateway-timeout          |      73
+    (20 rows)
+
+After prior "TARGETED" crawl and bulk ingest finished:
+
+    status                   |  count
+    -------------------------+---------
+    no-capture               | 3330055
+    success                  | 2455279
+    redirect-loop            |  197117
+    terminal-bad-status      |   82618
+    no-pdf-link              |   33046
+    blocked-cookie           |   16079
+    link-loop                |    6745
+    wrong-mimetype           |    3416
+    wayback-error            |    1385
+    empty-blob               |    1142
+    cdx-error                |     820
+    body-too-large           |     292
+    bad-gzip-encoding        |     281
+    wayback-content-error    |     267
+                             |     253
+    petabox-error            |     215
+    skip-url-blocklist       |     185
+    null-body                |     179
+    spn2-cdx-lookup-failure  |      89
+    gateway-timeout          |      73
+    (20 rows)
+
+Almost no change, which makes sense because of the `ingest_request.created`
+filter.
+
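The "almost no change" claim above was checked by eyeballing the two psql
dumps; it could be scripted instead. A minimal Python sketch (the dump file
names are hypothetical, not files in this repo):

    # compare_status_counts.py: diff two saved "status | count" psql dumps
    # usage: python3 compare_status_counts.py before.txt after.txt
    import sys

    def load_counts(path):
        counts = {}
        with open(path) as f:
            for line in f:
                parts = [p.strip() for p in line.split("|")]
                # keep only data rows of the form "some-status | 12345"
                if len(parts) == 2 and parts[1].isdigit():
                    counts[parts[0]] = int(parts[1])
        return counts

    before = load_counts(sys.argv[1])
    after = load_counts(sys.argv[2])
    for status in sorted(set(before) | set(after)):
        delta = after.get(status, 0) - before.get(status, 0)
        if delta:
            print(f"{status or '(blank)':25s} {delta:+d}")

Against the two tables above this would print just three rows: no-capture
-177, success +177, blocked-cookie +1.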
 ## Dump Seedlist
 
@@ -108,6 +165,7 @@ Dump rows for crawling:
         AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
         AND ingest_request.base_url NOT LIKE '%.archive.org%'
         AND ingest_request.base_url NOT LIKE '%://archive.org%'
+        AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%'
         AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
         AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
         AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
@@ -119,10 +177,14 @@ Dump rows for crawling:
         AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
     ) t1
 ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+    => before ingest and arxiv.org DOI exclusion: COPY 3309091
+    => COPY 3308914
 
 Prep ingest requests (for post-crawl use):
 
     ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+    => 3.31M 0:02:22 [23.2k/s]
 
 And actually dump seedlist(s):
 
@@ -130,6 +192,87 @@ And actually dump seedlist(s):
     cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
     cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
-    wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.*.txt
+    cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule
+
+    wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT*
+          15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt
+     3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json
+     3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt
+     3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt
+
+Inject seedlist into crawler:
+
+    scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp
+    ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+Top domains?
+
+    cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20
+     158497 www.scielo.br
+     144732 onlinelibrary.wiley.com
+     129349 www.researchsquare.com
+      94923 hal.archives-ouvertes.fr
+      69293 openresearchlibrary.org
+      64584 www.cell.com
+      60033 link.springer.com
+      50528 www.degruyter.com
+      49737 projecteuclid.org
+      45841 www.jstage.jst.go.jp
+      44819 www.mdpi.com
+      44325 ieeexplore.ieee.org
+      38091 dr.lib.iastate.edu
+      31030 www.nature.com
+      30300 discovery.ucl.ac.uk
+      27692 ntrs.nasa.gov
+      24215 orca.cardiff.ac.uk
+      23653 www.frontiersin.org
+      23474 pure.rug.nl
+      22660 www.sciencedirect.com
+
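For reference, the schedule logic above (terminal URL for `no-capture` rows,
base URL otherwise; dedupe; shuffle; emit Heritrix `F+` directives) could be
done in a single pass instead of several shell pipelines. A sketch, assuming
the same rows.json record shape; the rg/jq/awk/shuf pipeline is what was
actually run:

    # build_schedule.py: rows.json on stdin -> "F+ <url>" schedule on stdout
    import json
    import random
    import sys

    urls = set()
    for line in sys.stdin:
        row = json.loads(line)
        result = row.get("result") or {}
        if result.get("status") == "no-capture":
            url = result.get("terminal_url")  # where the redirect chain ended
        else:
            url = row.get("base_url")
        if url and url != "null":
            urls.add(url)

    schedule = [f"F+ {url}" for url in urls]
    random.shuffle(schedule)  # like `shuf`: avoid hammering a single host
    print("\n".join(schedule))

    # cat unpaywall_seedlist_$SNAPSHOT.rows.json | python3 build_schedule.py \
    #     > unpaywall_seedlist_$SNAPSHOT.schedule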
+## Post-Crawl bulk ingest
+
+    # enqueue for bulk processing
+    cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # done: 2022-07-06
+
+## Post-Crawl, Post-Ingest Stats
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND date(ingest_request.created) > '2022-04-01'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+    status                   |  count
+    -------------------------+---------
+    success                  | 4784948   => +2,329,669  ~77%
+    redirect-loop            |  485270   => +  288,153  ~10%
+    no-capture               |  317598   => -3,012,457
+    terminal-bad-status      |  267853   => +  185,235  ~ 6%
+    no-pdf-link              |  118303   => +   85,257
+    blocked-cookie           |  111373   => +   95,294
+    skip-url-blocklist       |   19368
+    link-loop                |    9091
+    wrong-mimetype           |    7163
+    cdx-error                |    2516
+    empty-blob               |    1961
+    wayback-error            |    1922
+    body-too-large           |     509
+    petabox-error            |     416
+    wayback-content-error    |     341
+    bad-gzip-encoding        |     281
+                             |     253
+    null-body                |     179
+    spn2-cdx-lookup-failure  |      89
+    gateway-timeout          |      73
+    (20 rows)
 
-Then run crawl (see `journal-crawls` git repo), including frontier generation.
+Groovy!
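If this gets automated for the openalex ingest, the kafkacat enqueue step
could move into Python as well. A sketch using `confluent-kafka` (broker and
topic copied from the command above; this is not what was actually run):

    # enqueue_bulk_ingest.py: ingest request JSON lines on stdin -> bulk topic
    import json
    import sys

    from confluent_kafka import Producer

    producer = Producer({"bootstrap.servers": "wbgrp-svc350.us.archive.org"})

    for line in sys.stdin:
        if "\\" in line:
            continue  # same skip as `rg -v "\\\\"` in the shell version
        # parse and re-serialize compact, like `jq . -c`
        record = json.dumps(json.loads(line))
        producer.produce("sandcrawler-prod.ingest-file-requests-bulk",
                         record.encode("utf-8"))
    producer.flush()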