From 257f41b174e04957aecf298b3ecdaae0ab44a1d2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 11 Mar 2022 12:24:19 -0800
Subject: DOAJ ingest/crawl notes

---
 notes/ingest/2022-03_doaj.md | 266 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 notes/ingest/2022-03_doaj.md

diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md
new file mode 100644
index 0000000..bace480
--- /dev/null
+++ b/notes/ingest/2022-03_doaj.md
@@ -0,0 +1,266 @@
+
+plan:
+- usual setup and dump ingest requests
+- filter ingest requests to targetted ccTLDs, and add those to crawl first
+
+## Transform and Load
+
+    # on sandcrawler-vm
+    mkdir -p /srv/sandcrawler/tasks/doaj
+    cd /srv/sandcrawler/tasks/doaj
+    wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz'
+
+    # in pipenv, in python directory
+    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz
+    # 9.08M 0:37:38 [4.02k/s]
+
+    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
+    # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0})
+    # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373})
+
+
+## Check Pre-Crawl Status
+
+2022-03-09, before the above load:
+
+    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.link_source = 'doaj'
+    GROUP BY ingest_request.ingest_type, status
+    -- next time include ingest_type in sort
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+     ingest_type |          status          |  count  
+    -------------+--------------------------+---------
+     pdf         | success                  | 2919808
+     html        | wrong-scope              | 1098998
+     pdf         | no-pdf-link              |  481532
+     pdf         | redirect-loop            |  429006
+     html        | success                  |  342501
+     html        | unknown-scope            |  225390
+     html        | redirect-loop            |  223927
+     html        | html-resource-no-capture |  187762
+     html        | no-capture               |  185418
+     pdf         | no-capture               |  171273
+     pdf         | null-body                |  129028
+     html        | null-body                |  100296
+     pdf         | terminal-bad-status      |   91551
+     pdf         | link-loop                |   25447
+     html        | wrong-mimetype           |   22640
+     html        | wayback-content-error    |   19028
+     html        | terminal-bad-status      |   13327
+     pdf         | wrong-mimetype           |    7688
+     xml         | success                  |    6897
+     html        | petabox-error            |    5529
+     pdf         | wayback-error            |    2706
+     xml         | null-body                |    2353
+     pdf         |                          |    2063
+     pdf         | wayback-content-error    |    1349
+     html        | cdx-error                |    1169
+     pdf         | cdx-error                |    1130
+     pdf         | petabox-error            |     679
+     html        |                          |     620
+     pdf         | empty-blob               |     562
+     html        | blocked-cookie           |     545
+    (30 rows)
+
+After the above load:
+
+     ingest_type |          status          |  count
+    -------------+--------------------------+---------
+     pdf         | success                  | 3036457
+     pdf         |                          | 1623208
+     html        |                          | 1208412
+     html        | wrong-scope              | 1108132
+     pdf         | no-pdf-link              |  485703
+     pdf         | redirect-loop            |  436085
+     html        | success                  |  342594
+     html        | unknown-scope            |  225412
+     html        | redirect-loop            |  223927
+     html        | html-resource-no-capture |  187999
+     html        | no-capture               |  187310
+     pdf         | no-capture               |  172033
+     pdf         | null-body                |  129266
+     html        | null-body                |  100296
+     pdf         | terminal-bad-status      |   91799
+     pdf         | link-loop                |   26933
+     html        | wrong-mimetype           |   22643
+     html        | wayback-content-error    |   19028
+     html        | terminal-bad-status      |   13327
+     xml         |                          |   11196
+     pdf         | wrong-mimetype           |    7929
+     xml         | success                  |    6897
+     html        | petabox-error            |    5530
+     pdf         | wayback-error            |    2707
+     xml         | null-body                |    2353
+     pdf         | wayback-content-error    |    1353
+     pdf         | cdx-error                |    1177
+     html        | cdx-error                |    1172
+     pdf         | petabox-error            |     771
+     pdf         | empty-blob               |     562
+    (30 rows)
+
+Dump ingest requests for crawling (or bulk ingest first?):
+
+    COPY (
+        SELECT row_to_json(t1.*)
+        FROM (
+            SELECT ingest_request.*, ingest_file_result as result
+            FROM ingest_request
+            LEFT JOIN ingest_file_result
+                ON ingest_file_result.base_url = ingest_request.base_url
+                AND ingest_file_result.ingest_type = ingest_request.ingest_type
+            WHERE
+                ingest_request.link_source = 'doaj'
+                -- AND (ingest_request.ingest_type = 'pdf'
+                --    OR ingest_request.ingest_type = 'xml')
+                AND (
+                    ingest_file_result.status IS NULL
+                    OR ingest_file_result.status = 'no-capture'
+                )
+                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+        ) t1
+    ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json';
+    => COPY 353819
+
+Not that many! Guess the filters are important?
+
+    SELECT COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.base_url = ingest_request.base_url
+        AND ingest_file_result.ingest_type = ingest_request.ingest_type
+    WHERE
+        ingest_request.link_source = 'doaj'
+        -- AND (ingest_request.ingest_type = 'pdf'
+        --    OR ingest_request.ingest_type = 'xml')
+        AND (
+            ingest_file_result.status IS NULL
+            OR ingest_file_result.status = 'no-capture'
+        );
+    => 3202164
+
+Transform:
+
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json
+    => 353k 0:00:16 [21.0k/s]
+
+Bulk ingest:
+
+    cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Dump seeds again (for crawling):
+
+    COPY (
+        SELECT row_to_json(t1.*)
+        FROM (
+            SELECT ingest_request.*, ingest_file_result as result
+            FROM ingest_request
+            LEFT JOIN ingest_file_result
+                ON ingest_file_result.base_url = ingest_request.base_url
+                AND ingest_file_result.ingest_type = ingest_request.ingest_type
+            WHERE
+                ingest_request.link_source = 'doaj'
+                -- AND (ingest_request.ingest_type = 'pdf'
+                --    OR ingest_request.ingest_type = 'xml')
+                AND (
+                    ingest_file_result.status IS NULL
+                    OR ingest_file_result.status = 'no-capture'
+                )
+                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+        ) t1
+    ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json';
+    # COPY 350661
+
+And stats again:
+
+     ingest_type |          status          |  count
+    -------------+--------------------------+---------
+     pdf         | success                  | 3037059
+     pdf         |                          | 1623208
+     html        |                          | 1208412
+     html        | wrong-scope              | 1108476
+     pdf         | no-pdf-link              |  485705
+     pdf         | redirect-loop            |  436850
+     html        | success                  |  342762
+     html        | unknown-scope            |  225412
+     html        | redirect-loop            |  224683
+     html        | html-resource-no-capture |  188058
+     html        | no-capture               |  185734
+     pdf         | no-capture               |  170452
+     pdf         | null-body                |  129266
+     html        | null-body                |  100296
+     pdf         | terminal-bad-status      |   91875
+     pdf         | link-loop                |   26933
+     html        | wrong-mimetype           |   22643
+     html        | wayback-content-error    |   19042
+     html        | terminal-bad-status      |   13333
+     xml         |                          |   11196
+     pdf         | wrong-mimetype           |    7929
+     xml         | success                  |    6898
+     html        | petabox-error            |    5535
+     pdf         | wayback-error            |    2711
+     xml         | null-body                |    2353
+     pdf         | wayback-content-error    |    1353
+     pdf         | cdx-error                |    1177
+     html        | cdx-error                |    1172
+     pdf         | petabox-error            |     772
+     html        | blocked-cookie           |     769
+    (30 rows)
+
+Transform:
+
+    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json
+
+Create seedlist:
+
+    cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \
+        | jq -r .base_url \
+        | sort -u -S 4G \
+        > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt
+
+Send off an added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will
+re-ingest when that completes (a week or two?).
-- 
cgit v1.2.3