aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-03-11 12:24:19 -0800
committerBryan Newbold <bnewbold@archive.org>2022-03-11 12:24:19 -0800
commit257f41b174e04957aecf298b3ecdaae0ab44a1d2 (patch)
tree93a5ba9fafe6823474d995118d0626ed2200ea27
parent9d096b26e35802553263d6472a534deb381e65da (diff)
downloadsandcrawler-257f41b174e04957aecf298b3ecdaae0ab44a1d2.tar.gz
sandcrawler-257f41b174e04957aecf298b3ecdaae0ab44a1d2.zip
DOAJ ingest/crawl notes
-rw-r--r--notes/ingest/2022-03_doaj.md266
1 files changed, 266 insertions, 0 deletions
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md
new file mode 100644
index 0000000..bace480
--- /dev/null
+++ b/notes/ingest/2022-03_doaj.md
@@ -0,0 +1,266 @@
+
+plan:
+- usual setup and dump ingest requests
+- filter ingest requests to targeted ccTLDs, and add those to crawl first
+
+## Transform and Load
+
+ # on sandcrawler-vm
+ mkdir -p /srv/sandcrawler/tasks/doaj
+ cd /srv/sandcrawler/tasks/doaj
+ wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz'
+
+ # in pipenv, in python directory
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz
+ # 9.08M 0:37:38 [4.02k/s]
+
+ zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+ # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0})
+ # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373})
+
+
+## Check Pre-Crawl Status
+
+2022-03-09, before the above load:
+
+ SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.link_source = 'doaj'
+ GROUP BY ingest_request.ingest_type, status
+ -- next time include ingest_type in sort
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 2919808
+ html | wrong-scope | 1098998
+ pdf | no-pdf-link | 481532
+ pdf | redirect-loop | 429006
+ html | success | 342501
+ html | unknown-scope | 225390
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187762
+ html | no-capture | 185418
+ pdf | no-capture | 171273
+ pdf | null-body | 129028
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91551
+ pdf | link-loop | 25447
+ html | wrong-mimetype | 22640
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ pdf | wrong-mimetype | 7688
+ xml | success | 6897
+ html | petabox-error | 5529
+ pdf | wayback-error | 2706
+ xml | null-body | 2353
+ pdf | | 2063
+ pdf | wayback-content-error | 1349
+ html | cdx-error | 1169
+ pdf | cdx-error | 1130
+ pdf | petabox-error | 679
+ html | | 620
+ pdf | empty-blob | 562
+ html | blocked-cookie | 545
+ (30 rows)
+
+After the above load:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3036457
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108132
+ pdf | no-pdf-link | 485703
+ pdf | redirect-loop | 436085
+ html | success | 342594
+ html | unknown-scope | 225412
+ html | redirect-loop | 223927
+ html | html-resource-no-capture | 187999
+ html | no-capture | 187310
+ pdf | no-capture | 172033
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91799
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19028
+ html | terminal-bad-status | 13327
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6897
+ html | petabox-error | 5530
+ pdf | wayback-error | 2707
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 771
+ pdf | empty-blob | 562
+ (30 rows)
+
+Dump ingest requests for crawling (or bulk ingest first?):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json';
+ => COPY 353819
+
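+Not that many! Guess the filters are important? Note: because of the `LEFT JOIN`, requests with no `ingest_file_result` row have a NULL `terminal_url`, so each `terminal_url NOT LIKE ...` condition evaluates to NULL and those never-attempted requests are dropped; what remains is essentially the `no-capture` rows. Count without the URL filters, for comparison: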
+
+ SELECT COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ );
+ => 3202164
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json
+ => 353k 0:00:16 [21.0k/s]
+
+Bulk ingest:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Dump seeds again (for crawling):
+
+ COPY (
+ SELECT row_to_json(t1.*)
+ FROM (
+ SELECT ingest_request.*, ingest_file_result as result
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_request.link_source = 'doaj'
+ -- AND (ingest_request.ingest_type = 'pdf'
+ -- OR ingest_request.ingest_type = 'xml')
+ AND (
+ ingest_file_result.status IS NULL
+ OR ingest_file_result.status = 'no-capture'
+ )
+ AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+ AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+ AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+ AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+ AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+ AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+ AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+ ) t1
+ ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json';
+ # COPY 350661
+
+And stats again:
+
+ ingest_type | status | count
+ -------------+--------------------------+---------
+ pdf | success | 3037059
+ pdf | | 1623208
+ html | | 1208412
+ html | wrong-scope | 1108476
+ pdf | no-pdf-link | 485705
+ pdf | redirect-loop | 436850
+ html | success | 342762
+ html | unknown-scope | 225412
+ html | redirect-loop | 224683
+ html | html-resource-no-capture | 188058
+ html | no-capture | 185734
+ pdf | no-capture | 170452
+ pdf | null-body | 129266
+ html | null-body | 100296
+ pdf | terminal-bad-status | 91875
+ pdf | link-loop | 26933
+ html | wrong-mimetype | 22643
+ html | wayback-content-error | 19042
+ html | terminal-bad-status | 13333
+ xml | | 11196
+ pdf | wrong-mimetype | 7929
+ xml | success | 6898
+ html | petabox-error | 5535
+ pdf | wayback-error | 2711
+ xml | null-body | 2353
+ pdf | wayback-content-error | 1353
+ pdf | cdx-error | 1177
+ html | cdx-error | 1172
+ pdf | petabox-error | 772
+ html | blocked-cookie | 769
+ (30 rows)
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json
+
+Create seedlist:
+
+ cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+ | jq -r .base_url \
+ | sort -u -S 4G \
+ > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt
+
+Sent off and added to the `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl; will
+re-ingest when that completes (a week or two?).