DOAJ crawl ingest stats

author: Bryan Newbold <bnewbold@archive.org> 2020-12-31 12:17:00 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-12-31 12:17:00 -0800
commit: b0d4ac285f101a641cee0e87da80d47a468c08aa (patch)
tree: f9aabc05b6ca43528c376b56ed3ae7351c15f2b4 /notes/ingest
parent: 476fa2ff8c5e561287390505c17caf1888d6b9f4 (diff)
download: sandcrawler-b0d4ac285f101a641cee0e87da80d47a468c08aa.tar.gz
sandcrawler-b0d4ac285f101a641cee0e87da80d47a468c08aa.zip
1 files changed, 295 insertions, 0 deletions
diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md
new file mode 100644
index 0000000..473dd0d
--- /dev/null
+++ b/notes/ingest/2020-11_doaj.md
@@ -0,0 +1,295 @@
+
+This is the first ingest (and crawl) of URLs from DOAJ article-level metadata.
+It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in
+the past.
+
+Working off a 2020-11-13 snapshot.
+
+## Transform and Load
+
+    # in sandcrawler pipenv on aitio
+    zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+    => 6.7M 0:24:28 [4.57k/s]
+
+    cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => ran in to error with blank `base_url`
+
+Second try after patches:
+
+    zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json
+    => 6.7M 0:24:29 [4.56k/s]
+
+    cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request -
+    => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0})
+    => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036})
+
+## Check Pre-Crawl Status
+
+    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.link_source = 'doaj'
+    GROUP BY ingest_request.ingest_type, status
+    -- next time include ingest_type in sort
+    ORDER BY COUNT DESC
+    LIMIT 30;
+
+
+     ingest_type |         status          |  count  
+    -------------+-------------------------+---------
+     pdf         |                         | 3711532
+     html        |                         | 2429003
+     pdf         | success                 |  454403
+     pdf         | redirect-loop           |   48587
+     pdf         | no-pdf-link             |   24901
+     pdf         | no-capture              |   11569
+     xml         |                         |    9442
+     pdf         | link-loop               |    8466
+     pdf         | terminal-bad-status     |    2015
+     pdf         | wrong-mimetype          |    1441
+     pdf         | null-body               |    1057
+     pdf         | petabox-error           |     299
+     pdf         | cdx-error               |     124
+     pdf         | gateway-timeout         |     114
+     pdf         | wayback-error           |      77
+     pdf         | spn2-cdx-lookup-failure |      20
+     pdf         | invalid-host-resolution |       4
+     pdf         | spn2-error              |       1
+    (18 rows)
+
+## Dump new URLs, Transform, Bulk Ingest (PDF and XML only)
+
+Dump:
+
+    COPY (
+        SELECT row_to_json(ingest_request.*)
+        FROM ingest_request
+        LEFT JOIN ingest_file_result
+            ON ingest_file_result.base_url = ingest_request.base_url
+            AND ingest_file_result.ingest_type = ingest_request.ingest_type
+        WHERE
+            (ingest_request.ingest_type = 'pdf'
+                OR ingest_request.ingest_type = 'xml')
+            AND ingest_request.link_source = 'doaj'
+            -- AND date(ingest_request.created) > '2020-12-01'
+            AND (ingest_file_result.status IS NULL
+                OR ingest_file_result.status = 'no-capture')
+    ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json';
+    => COPY 3732543
+
+Transform:
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json
+    => 3.73M 0:02:18 [26.9k/s]
+
+Definitely some non-URL strings in there; should try to filter those out
+earlier in the transform process. And/or have a constraint on the URL column in
+the database.
+
+Enqueue the whole batch:
+
+    cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Started this batch off at 2020-11-19 18:10 (Pacific time)
+
+Stats after run:
+
+    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.link_source = 'doaj'
+    GROUP BY ingest_request.ingest_type, status
+    ORDER BY ingest_request.ingest_type, COUNT DESC
+    LIMIT 30;
+
+## Dump Seedlist
+
+After preliminary bulk ingest attempts, dump rows:
+
+    COPY (
+        SELECT row_to_json(t1.*)
+        FROM (
+            SELECT ingest_request.*, ingest_file_result as result
+            FROM ingest_request
+            LEFT JOIN ingest_file_result
+                ON ingest_file_result.base_url = ingest_request.base_url
+                AND ingest_file_result.ingest_type = ingest_request.ingest_type
+            WHERE
+                ingest_request.link_source = 'doaj'
+                AND (ingest_request.ingest_type = 'pdf'
+                    OR ingest_request.ingest_type = 'xml')
+                AND ingest_file_result.status != 'success'
+                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+                AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+        ) t1
+    ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json';
+    => 1,899,555
+
+TODO: filter for valid URLs
+
+Prep ingest requests (for post-crawl use):
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json
+
+And actually dump seedlist(s):
+
+    cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt
+    cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt
+    cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt
+
+    wc -l doaj_seedlist_2020-11-19.*.txt
+
+## Post-Crawl Ingest
+
+Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ
+identifiers are all in fatcat:
+
+    cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+    # started 2020-12-23 15:05 (Pacific)
+    # finished around 2020-12-31, after one long/slow partition
+
+Stats again after everything:
+
+    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.link_source = 'doaj'
+    GROUP BY ingest_request.ingest_type, status
+    ORDER BY ingest_request.ingest_type, COUNT DESC
+    LIMIT 50;
+
+     ingest_type |          status          |  count  
+    -------------+--------------------------+---------
+     html        | wrong-scope              | 1089423
+     html        | no-capture               |  423917
+     html        | redirect-loop            |  212910
+     html        | unknown-scope            |  204069
+     html        | html-resource-no-capture |  165587
+     html        | success                  |  122937
+     html        | null-body                |  100296
+     html        | wayback-content-error    |   53918
+     html        | wrong-mimetype           |   18908
+     html        | terminal-bad-status      |   14059
+     html        | petabox-error            |   13520
+     html        | cdx-error                |    6823
+     html        | wayback-error            |     890
+     html        |                          |     620
+     html        | blocked-cookie           |     543
+     html        | blocked-captcha          |     250
+     html        | redirects-exceeded       |     135
+     html        | too-many-resources       |     111
+     html        | max-hops-exceeded        |      84
+     html        | bad-redirect             |       3
+     pdf         | success                  | 2851324
+     pdf         | no-pdf-link              |  529914
+     pdf         | redirect-loop            |  349494
+     pdf         | no-capture               |  272202
+     pdf         | null-body                |  129027
+     pdf         | terminal-bad-status      |   91796
+     pdf         | link-loop                |   25267
+     pdf         | wrong-mimetype           |    6504
+     pdf         | wayback-error            |    2968
+     pdf         |                          |    2068
+     pdf         | wayback-content-error    |    1548
+     pdf         | cdx-error                |    1095
+     pdf         | petabox-error            |    1024
+     pdf         | bad-redirect             |     203
+     pdf         | redirects-exceeded       |     135
+     pdf         | timeout                  |      20
+     pdf         | max-hops-exceeded        |      19
+     pdf         | bad-gzip-encoding        |       2
+     xml         | success                  |    6897
+     xml         | null-body                |    2353
+     xml         | wrong-mimetype           |     184
+     xml         | no-capture               |       5
+     xml         | cdx-error                |       3
+    (43 rows)
+
+
+And on filtered subset that we actually crawled:
+
+    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE 
+        ingest_request.link_source = 'doaj'
+        AND (ingest_request.ingest_type = 'pdf'
+            OR ingest_request.ingest_type = 'xml')
+        AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+        AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+        AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+        AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+        AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+        AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+        AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+        AND ingest_request.base_url NOT LIKE '%://archive.org/%'
+        AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
+        AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
+        AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+        AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+        AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+        AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+        AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+        AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+        AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+        AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
+        AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
+        AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
+    GROUP BY ingest_request.ingest_type, status
+    ORDER BY ingest_request.ingest_type, COUNT DESC
+    LIMIT 50;
+
+     ingest_type |        status         |  count  
+    -------------+-----------------------+---------
+     pdf         | success               | 2851286
+     pdf         | no-pdf-link           |  527495
+     pdf         | redirect-loop         |  345138
+     pdf         | no-capture            |  268140
+     pdf         | null-body             |  129027
+     pdf         | terminal-bad-status   |   91125
+     pdf         | link-loop             |   25267
+     pdf         | wrong-mimetype        |    6504
+     pdf         | wayback-error         |    2907
+     pdf         | petabox-error         |     363
+     pdf         | wayback-content-error |     242
+     pdf         | bad-redirect          |     203
+     pdf         | redirects-exceeded    |     135
+     pdf         | max-hops-exceeded     |      19
+     pdf         | cdx-error             |      15
+     pdf         | bad-gzip-encoding     |       2
+     xml         | success               |    6897
+     xml         | null-body             |    2353
+     xml         | wrong-mimetype        |     184
+     xml         | no-capture            |       5
+    (20 rows)
+
author	Bryan Newbold <bnewbold@archive.org>	2020-12-31 12:17:00 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-12-31 12:17:00 -0800
commit	b0d4ac285f101a641cee0e87da80d47a468c08aa (patch)
tree	f9aabc05b6ca43528c376b56ed3ae7351c15f2b4 /notes/ingest
parent	476fa2ff8c5e561287390505c17caf1888d6b9f4 (diff)
download	sandcrawler-b0d4ac285f101a641cee0e87da80d47a468c08aa.tar.gz sandcrawler-b0d4ac285f101a641cee0e87da80d47a468c08aa.zip