aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-11-02 17:47:23 -0800
committerBryan Newbold <bnewbold@archive.org>2020-11-02 17:47:23 -0800
commitb0b66c20c6ffb9d8acc626068964d7dfd5d3bcdc (patch)
tree271f3f7a5cd86031cb294496666fcabf355dabe4 /notes/ingest
parent88c575d982b0ef87c282fedd018feaf389492267 (diff)
downloadsandcrawler-b0b66c20c6ffb9d8acc626068964d7dfd5d3bcdc.tar.gz
sandcrawler-b0b66c20c6ffb9d8acc626068964d7dfd5d3bcdc.zip
unpaywall oct 2020 crawl notes
Diffstat (limited to 'notes/ingest')
-rw-r--r--notes/ingest/2020-10_unpaywall.md127
1 files changed, 82 insertions, 45 deletions
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md
index c1c8388..7b97d08 100644
--- a/notes/ingest/2020-10_unpaywall.md
+++ b/notes/ingest/2020-10_unpaywall.md
@@ -49,7 +49,8 @@ Start small, to test no-capture behavior:
cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
-Overall status after that:
+
+## Check Pre-Crawl Status
SELECT ingest_file_result.status, COUNT(*)
FROM ingest_request
@@ -61,55 +62,91 @@ Overall status after that:
AND ingest_request.link_source = 'unpaywall'
GROUP BY status
ORDER BY COUNT DESC
- LIMIT 25;
-
- status | count
- -------------------------------------+----------
- success | 23661084
- no-capture | 3015448
- no-pdf-link | 2302092
- redirect-loop | 1542484
- terminal-bad-status | 1044654
- wrong-mimetype | 114315
- link-loop | 36357
- cdx-error | 20055
- null-body | 14513
- wayback-error | 14175
- gateway-timeout | 3747
- spn2-cdx-lookup-failure | 1250
- petabox-error | 1171
- redirects-exceeded | 752
- invalid-host-resolution | 464
- bad-redirect | 131
- spn2-error | 109
- spn2-error:job-failed | 91
- timeout | 19
- | 13
- spn2-error:soft-time-limit-exceeded | 9
- bad-gzip-encoding | 6
- spn2-error:pending | 1
- skip-url-blocklist | 1
- pending | 1
- (25 rows)
-
-## Crawl
-
-Re-crawl broadly (eg, all URLs that have failed before, not just `no-capture`):
+ LIMIT 20;
+
+
+ status | count
+ -------------------------+----------
+ success | 23661282
+ no-capture | 3015447
+ no-pdf-link | 2302102
+ redirect-loop | 1542566
+ terminal-bad-status | 1044676
+ wrong-mimetype | 114315
+ link-loop | 36358
+ cdx-error | 20150
+ null-body | 14513
+ wayback-error | 13644
+ gateway-timeout | 3776
+ spn2-cdx-lookup-failure | 1260
+ petabox-error | 1171
+ redirects-exceeded | 752
+ invalid-host-resolution | 464
+ spn2-error | 147
+ bad-redirect | 131
+ spn2-error:job-failed | 91
+ wayback-content-error | 45
+ timeout | 19
+ (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
COPY (
- SELECT row_to_json(r) FROM (
- SELECT ingest_request.*, ingest_file_result.terminal_url as terminal_url
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
FROM ingest_request
LEFT JOIN ingest_file_result
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE
ingest_request.ingest_type = 'pdf'
- AND ingest_request.ingest_request_source = 'unpaywall'
- AND ingest_file_result.status != 'success'
- ) r
- ) TO '/grande/snapshots/oa_doi_reingest_recrawl_20201014.rows.json';
- => 8111845
-
-Hrm. Not sure how to feel about the no-pdf-link. Guess it is fine!
-
+ AND ingest_request.link_source = 'unpaywall'
+ AND (ingest_file_result.status = 'no-capture'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ ) t1
+ ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json';
+ => 2,936,404
+
+ # TODO: in the future also exclude "www.archive.org"
+
+Prep ingest requests (for post-crawl use):
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json
+
+And actually dump seedlist(s):
+
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt
+ cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt
+
+ wc -l unpaywall_seedlist_2020-11-02.*.txt
+ 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt
+ 2713866 unpaywall_seedlist_2020-11-02.url.txt
+
+With things like jsessionid, suspect that crawling just the terminal URLs is
+going to work better than both full and terminal.
+
+Finding a fraction of `no-capture` which have partial/stub URLs as terminal.
+
+TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP).