unpaywall crawl wrap-up notes

author: Bryan Newbold <bnewbold@archive.org> 2022-07-14 14:39:30 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-14 14:39:30 -0700
commit: 2114cce448c5ff0424f667ba5298010722965d73 (patch)
tree: 610570f0adce8b392e79d57cf7e65778151d04a0
parent: c32b13949a06e9e834c1fac40e3609cb8eeb9f31 (diff)
download: sandcrawler-2114cce448c5ff0424f667ba5298010722965d73.tar.gz
sandcrawler-2114cce448c5ff0424f667ba5298010722965d73.zip
1 files changed, 145 insertions, 2 deletions
diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md
index 600b2d6..bc78998 100644
--- a/notes/ingest/2022-04_unpaywall.md
+++ b/notes/ingest/2022-04_unpaywall.md
@@ -5,6 +5,10 @@ This will probably be the last unpaywall crawl? Will switch to openalex in the
 future, because we can automate that ingest process, and run it on our own
 schedule.
 
+    export SNAPSHOT=2022-03-09
+    export CRAWLVM=wbgrp-svc279.us.archive.org
+    export CRAWLNAME=UNPAYWALL-CRAWL-2022-04
+
 ## Download and Archive
 
     wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz'
@@ -76,6 +80,59 @@ Only the recent bulk ingest:
     ORDER BY COUNT DESC
     LIMIT 20;
 
+             status          |  count
+    -------------------------+---------
+     no-capture              | 3330232
+     success                 | 2455102
+     redirect-loop           |  197117
+     terminal-bad-status     |   82618
+     no-pdf-link             |   33046
+     blocked-cookie          |   16078
+     link-loop               |    6745
+     wrong-mimetype          |    3416
+     wayback-error           |    1385
+     empty-blob              |    1142
+     cdx-error               |     820
+     body-too-large          |     292
+     bad-gzip-encoding       |     281
+     wayback-content-error   |     267
+                             |     253
+     petabox-error           |     215
+     skip-url-blocklist      |     185
+     null-body               |     179
+     spn2-cdx-lookup-failure |      89
+     gateway-timeout         |      73
+    (20 rows)
+
+After the prior "TARGETED" crawl and bulk ingest finished:
+
+             status          |  count
+    -------------------------+---------
+     no-capture              | 3330055
+     success                 | 2455279
+     redirect-loop           |  197117
+     terminal-bad-status     |   82618
+     no-pdf-link             |   33046
+     blocked-cookie          |   16079
+     link-loop               |    6745
+     wrong-mimetype          |    3416
+     wayback-error           |    1385
+     empty-blob              |    1142
+     cdx-error               |     820
+     body-too-large          |     292
+     bad-gzip-encoding       |     281
+     wayback-content-error   |     267
+                             |     253
+     petabox-error           |     215
+     skip-url-blocklist      |     185
+     null-body               |     179
+     spn2-cdx-lookup-failure |      89
+     gateway-timeout         |      73
+    (20 rows)
+
+Almost no change, which makes sense because of the `ingest_request.created`
+filter.
+
 
 ## Dump Seedlist
 
@@ -108,6 +165,7 @@ Dump rows for crawling:
                 AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
                 AND ingest_request.base_url NOT LIKE '%.archive.org%'
                 AND ingest_request.base_url NOT LIKE '%://archive.org%'
+                AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%'
                 AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                 AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                 AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
@@ -119,10 +177,14 @@ Dump rows for crawling:
                 AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
         ) t1
     ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json';
+    => before ingest and arxiv.org DOI exclusion: COPY 3309091
+    => after ingest and arxiv.org DOI exclusion: COPY 3308914
+
 
 Prep ingest requests (for post-crawl use):
 
     ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json
+    => 3.31M 0:02:22 [23.2k/s]
 
 And actually dump seedlist(s):
 
@@ -130,6 +192,87 @@ And actually dump seedlist(s):
     cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt
     cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt
 
-    wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.*.txt
+    cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule
+
+    wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT*
+            15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt
+       3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json
+       3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt
+       3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt
+
+Inject seedlist into crawler:
+
+    scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp
+    ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/
+
+Top domains?
+
+    cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c |  sort -nr | head -n20
+     158497 www.scielo.br
+     144732 onlinelibrary.wiley.com
+     129349 www.researchsquare.com
+      94923 hal.archives-ouvertes.fr
+      69293 openresearchlibrary.org
+      64584 www.cell.com
+      60033 link.springer.com
+      50528 www.degruyter.com
+      49737 projecteuclid.org
+      45841 www.jstage.jst.go.jp
+      44819 www.mdpi.com
+      44325 ieeexplore.ieee.org
+      38091 dr.lib.iastate.edu
+      31030 www.nature.com
+      30300 discovery.ucl.ac.uk
+      27692 ntrs.nasa.gov
+      24215 orca.cardiff.ac.uk
+      23653 www.frontiersin.org
+      23474 pure.rug.nl
+      22660 www.sciencedirect.com
+
+
+## Post-Crawl bulk ingest
+
+    # enqueue for bulk processing
+    cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # done: 2022-07-06
+
+## Post-Crawl, Post-Ingest Stats
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+        AND date(ingest_request.created) > '2022-04-01'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+             status          |  count  
+    -------------------------+---------
+     success                 | 4784948 => +2,329,669  ~77%
+     redirect-loop           |  485270 => +  288,153  ~10%
+     no-capture              |  317598 => -3,012,457
+     terminal-bad-status     |  267853 => +  185,235  ~ 6%
+     no-pdf-link             |  118303 => +   85,257
+     blocked-cookie          |  111373 => +   95,294
+     skip-url-blocklist      |   19368
+     link-loop               |    9091
+     wrong-mimetype          |    7163
+     cdx-error               |    2516
+     empty-blob              |    1961
+     wayback-error           |    1922
+     body-too-large          |     509
+     petabox-error           |     416
+     wayback-content-error   |     341
+     bad-gzip-encoding       |     281
+                             |     253
+     null-body               |     179
+     spn2-cdx-lookup-failure |      89
+     gateway-timeout         |      73
+    (20 rows)
 
-Then run crawl (see `journal-crawls` git repo), including frontier generation.
+Groovy!
author	Bryan Newbold <bnewbold@archive.org>	2022-07-14 14:39:30 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2022-07-14 14:39:30 -0700
commit	2114cce448c5ff0424f667ba5298010722965d73 (patch)
tree	610570f0adce8b392e79d57cf7e65778151d04a0
parent	c32b13949a06e9e834c1fac40e3609cb8eeb9f31 (diff)
download	sandcrawler-2114cce448c5ff0424f667ba5298010722965d73.tar.gz sandcrawler-2114cce448c5ff0424f667ba5298010722965d73.zip