aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-05-27 11:56:07 -0700
committerBryan Newbold <bnewbold@archive.org>2021-05-27 11:56:07 -0700
commitc5228d8c9f7ce0613e9016e9b85212665c2eed98 (patch)
tree01c2758fc66f72474bb744440627cafc5d890a54 /notes/ingest
parentd5ad139fe271c994643fde128ca22be266cb3003 (diff)
downloadsandcrawler-c5228d8c9f7ce0613e9016e9b85212665c2eed98.tar.gz
sandcrawler-c5228d8c9f7ce0613e9016e9b85212665c2eed98.zip
2021-04 unpaywall crawl notes
Diffstat (limited to 'notes/ingest')
-rw-r--r--notes/ingest/2021-04_unpaywall.md368
1 files changed, 368 insertions, 0 deletions
diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md
new file mode 100644
index 0000000..d7643f4
--- /dev/null
+++ b/notes/ingest/2021-04_unpaywall.md
@@ -0,0 +1,368 @@
+
+New snapshot released 2021-02-18, finally getting around to a crawl two months
+later.
+
+Intend to do same style of crawl as in the past. One change is that
+sandcrawler-db has moved to a focal VM.
+
+
+## Transform and Load
+
+ # in sandcrawler pipenv on sandcrawler1-vm (svc506)
+ zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json
+ => 30.0M 3:14:59 [2.57k/s]
+
+ cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+ => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0})
+ => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007})
+
+## Dump new URLs, Transform, Bulk Ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ -- AND date(ingest_request.created) > '2021-01-01'
+ AND (ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture')
+ ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json';
+ => COPY 3277484
+
+ # previous, 2020-10 run: COPY 4216339
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json
+ => 3.28M 0:01:42 [32.1k/s]
+
+Enqueue the whole batch:
+
+ cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 26385866
+ no-pdf-link | 2132565
+ no-capture | 2092111
+ redirect-loop | 1732543
+ terminal-bad-status | 1504555
+ wayback-content-error | 357345
+ wrong-mimetype | 126070
+ link-loop | 76808
+ cdx-error | 22756
+ null-body | 22066
+ wayback-error | 13768
+ gateway-timeout | 3804
+ petabox-error | 3608
+ spn2-cdx-lookup-failure | 1225
+ redirects-exceeded | 892
+ invalid-host-resolution | 505
+ bad-redirect | 151
+ spn2-error | 108
+ spn2-error:job-failed | 91
+ bad-gzip-encoding | 27
+ (20 rows)
+
+Only the recent bulk ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-01-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 1348623
+ no-capture | 1231582
+ redirect-loop | 45622
+ no-pdf-link | 37312
+ terminal-bad-status | 24162
+ wrong-mimetype | 6684
+ link-loop | 5757
+ null-body | 1288
+ wayback-content-error | 1123
+ cdx-error | 831
+ petabox-error | 697
+ wayback-error | 185
+ invalid-host-resolution | 41
+ gateway-timeout | 29
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ spn2-cdx-lookup-failure | 7
+ bad-redirect | 4
+ timeout | 3
+ redirects-exceeded | 3
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%.archive.org%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json';
+ => 2020-10: 2,936,404
+ => 2021-04: 1,805,192
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json
+ => 1.81M 0:01:27 [20.6k/s]
+
+And actually dump seedlist(s):
+
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+
+ wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt
+ 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt
+ 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt
+ 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt
+
+## Post-Crawl Bulk Ingest
+
+ cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => 1,804,211 consumer group lag
+
+## Post-Ingest Stats
+
+Overall status (unpaywall, all time):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ success | 27242251
+ no-pdf-link | 2746237
+ redirect-loop | 1821132
+ terminal-bad-status | 1553441
+ no-capture | 478559
+ wayback-content-error | 357390
+ wrong-mimetype | 127365
+ link-loop | 79389
+ cdx-error | 23170
+ null-body | 23169
+ wayback-error | 13704
+ gateway-timeout | 3803
+ petabox-error | 3642
+ redirects-exceeded | 1427
+ spn2-cdx-lookup-failure | 1214
+ invalid-host-resolution | 505
+ bad-redirect | 153
+ spn2-error | 107
+ spn2-error:job-failed | 91
+ body-too-large | 84
+ (20 rows)
+
+Ingest stats broken down by publication stage:
+
+ SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY release_stage, status
+ ORDER BY release_stage, COUNT DESC
+ LIMIT 100;
+
+ release_stage | status | count
+ ---------------+-------------------------------------+----------
+ accepted | success | 1213335
+ accepted | no-pdf-link | 29292
+ accepted | redirect-loop | 12769
+ accepted | terminal-bad-status | 11264
+ accepted | no-capture | 10187
+ accepted | cdx-error | 1015
+ accepted | wayback-content-error | 757
+ accepted | wrong-mimetype | 501
+ accepted | link-loop | 407
+ accepted | wayback-error | 207
+ accepted | petabox-error | 189
+ accepted | redirects-exceeded | 125
+ accepted | null-body | 34
+ accepted | spn2-cdx-lookup-failure | 5
+ accepted | gateway-timeout | 4
+ accepted | blocked-cookie | 2
+ accepted | bad-redirect | 1
+ accepted | body-too-large | 1
+ published | success | 20196774
+ published | no-pdf-link | 2647969
+ published | redirect-loop | 1477558
+ published | terminal-bad-status | 1320013
+ published | wayback-content-error | 351931
+ published | no-capture | 297603
+ published | wrong-mimetype | 115440
+ published | link-loop | 76431
+ published | cdx-error | 18125
+ published | null-body | 17559
+ published | wayback-error | 10466
+ published | petabox-error | 2684
+ published | gateway-timeout | 1979
+ published | redirects-exceeded | 947
+ published | spn2-cdx-lookup-failure | 877
+ published | invalid-host-resolution | 457
+ published | bad-redirect | 120
+ published | spn2-error:job-failed | 77
+ published | spn2-error | 70
+ published | body-too-large | 39
+ published | bad-gzip-encoding | 24
+ published | timeout | 24
+ published | blocked-cookie | 23
+ published | spn2-error:soft-time-limit-exceeded | 4
+ published | | 2
+ published | pending | 1
+ published | spn2-error:pending | 1
+ published | too-many-redirects | 1
+ submitted | success | 5832117
+ submitted | redirect-loop | 330785
+ submitted | terminal-bad-status | 222152
+ submitted | no-capture | 170766
+ submitted | no-pdf-link | 68934
+ submitted | wrong-mimetype | 11424
+ submitted | null-body | 5576
+ submitted | wayback-content-error | 4702
+ submitted | cdx-error | 4030
+ submitted | wayback-error | 3031
+ submitted | link-loop | 2551
+ submitted | gateway-timeout | 1820
+ submitted | petabox-error | 769
+ submitted | redirects-exceeded | 355
+ submitted | spn2-cdx-lookup-failure | 332
+ submitted | invalid-host-resolution | 48
+ submitted | body-too-large | 44
+ submitted | spn2-error | 37
+ submitted | bad-redirect | 32
+ submitted | spn2-error:job-failed | 14
+ submitted | | 13
+ submitted | spn2-error:soft-time-limit-exceeded | 5
+ submitted | timeout | 4
+ submitted | bad-gzip-encoding | 3
+ submitted | skip-url-blocklist | 1
+ | no-pdf-link | 42
+ | success | 25
+ | redirect-loop | 20
+ | terminal-bad-status | 12
+ | no-capture | 3
+ (76 rows)
+
+
+Only the recent updates:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'unpaywall'
+ AND date(ingest_request.created) > '2021-04-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+---------
+ success | 2192376
+ no-capture | 152183
+ no-pdf-link | 144174
+ redirect-loop | 125988
+ terminal-bad-status | 67307
+ link-loop | 8292
+ wrong-mimetype | 7942
+ null-body | 2270
+ cdx-error | 1223
+ wayback-content-error | 1147
+ petabox-error | 728
+ wayback-error | 155
+ body-too-large | 82
+ invalid-host-resolution | 41
+ gateway-timeout | 28
+ blocked-cookie | 22
+ bad-gzip-encoding | 20
+ timeout | 7
+ bad-redirect | 6
+ redirects-exceeded | 4
+ (20 rows)
+
+In total, this iteration of unpaywall ingest resulted in:
+
+- 2,703,999 raw ingest requests (new URLs total)
+- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet
+- 843,753 (31.2%) success from new heritrix crawling
+- 2,192,376 (81.1%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success)