author    Bryan Newbold <bnewbold@archive.org>  2022-09-06 09:49:37 -0700
committer Bryan Newbold <bnewbold@archive.org>  2022-09-06 09:49:37 -0700
commit    49836dba33bfb481213c03fe4f84c4d876d2d3bd (patch)
tree      739b6744b12914e07951bd8186a83c55624e5268
parent    8918b4106aa33d936f07df41ac0bdc65825e6ef4 (diff)
download  sandcrawler-49836dba33bfb481213c03fe4f84c4d876d2d3bd.tar.gz
          sandcrawler-49836dba33bfb481213c03fe4f84c4d876d2d3bd.zip

summer 2022 ingest notes

 notes/ingest/2022-07-19_dblp.md  |  50
 notes/ingest/2022-07_doaj.md     | 199
 notes/ingest/2022-07_targeted.md | 140
 3 files changed, 389 insertions, 0 deletions
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@

Cross-posting from fatcat bulk metadata update/ingest.

    zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # 631k 0:00:11 [54.0k/s]


## Post-Crawl Stats

This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
2022-09-06:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'dblp'
    GROUP BY ingest_request.ingest_type, status
    -- ORDER BY ingest_request.ingest_type, COUNT DESC
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |        status         | count
    -------------+-----------------------+--------
     pdf         | success               | 305142
     pdf         | no-pdf-link           | 192683
     pdf         | no-capture            |  42634
     pdf         | terminal-bad-status   |  38041
     pdf         | skip-url-blocklist    |  31055
     pdf         | link-loop             |   9263
     pdf         | wrong-mimetype        |   4545
     pdf         | redirect-loop         |   3952
     pdf         | empty-blob            |   2705
     pdf         | wayback-content-error |    834
     pdf         | wayback-error         |    294
     pdf         | petabox-error         |    202
     pdf         | blocked-cookie        |    155
     pdf         | cdx-error             |    115
     pdf         | body-too-large        |     66
     pdf         | bad-redirect          |     19
     pdf         | timeout               |      7
     pdf         | bad-gzip-encoding     |      4
    (18 rows)

That is quite a lot of `no-pdf-link`; it would be worth doing a random sample
and/or a re-ingest. There is also a chunk of `no-capture` worth retrying.
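For the random-sample idea, something like the following would pull the
`no-pdf-link` terminal URLs for manual review. This is only a sketch, not a
command that was run; the `sandcrawler` database name and the output path are
assumptions:

    # dump dblp no-pdf-link terminal URLs (database name and path are assumptions)
    psql sandcrawler -c "COPY (
        SELECT ingest_file_result.terminal_url
        FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.link_source = 'dblp'
            AND ingest_file_result.status = 'no-pdf-link'
    ) TO STDOUT" > /srv/sandcrawler/tasks/dblp_no_pdf_link.terminal_urls.txt

    # eyeball a random sample of ~50 URLs
    shuf -n 50 /srv/sandcrawler/tasks/dblp_no_pdf_link.terminal_urls.txt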
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@

This is just a load and bulk ingest; a separate 'TARGETED' heritrix bulk crawl
will follow, covering these DOAJ URLs along with the JALC and DBLP URLs.

    export SNAPSHOT=2022-07-20

## Transform and Load

    # on sandcrawler-vm
    mkdir -p /srv/sandcrawler/tasks/doaj
    cd /srv/sandcrawler/tasks/doaj
    wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"

    # in pipenv, in python directory
    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
    # 9.72M 0:36:28 [4.44k/s]

    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
    # 9.72M 0:17:04 [9.49k/s]
    # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
    # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})

Stats after this load:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- next time include ingest_type in sort
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 3165539
     pdf         |                          | 2078874
     html        |                          | 1547698
     html        | wrong-scope              | 1114332
     pdf         | no-pdf-link              |  517261
     html        | success                  |  388376
     html        | unknown-scope            |  242044
     pdf         | no-capture               |  179030
     pdf         | terminal-bad-status      |  174741
     html        | no-capture               |  155323
     pdf         | null-body                |  129267
     pdf         | redirect-loop            |  127136
     html        | html-resource-no-capture |  117275
     html        | null-body                |  100296
     pdf         | blocked-cookie           |   71093
     html        | redirect-loop            |   65519
     html        | terminal-bad-status      |   64856
     html        | blocked-cookie           |   64095
     html        | spn2-backoff             |   55173
     pdf         | link-loop                |   27440
     html        | wrong-mimetype           |   26016
     html        | wayback-content-error    |   20109
     xml         |                          |   13624
     pdf         | wrong-mimetype           |    8411
     xml         | success                  |    6899
     html        | petabox-error            |    6199
     html        | wayback-error            |    5269
     html        | spn2-cdx-lookup-failure  |    4635
     html        | spn2-recent-capture      |    4527
     xml         | null-body                |    2353
    (30 rows)

## Bulk Ingest

    COPY (
        SELECT row_to_json(t1.*)
        FROM (
            SELECT ingest_request.*, ingest_file_result as result
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.base_url = ingest_request.base_url
                AND ingest_file_result.ingest_type = ingest_request.ingest_type
            WHERE
                ingest_request.link_source = 'doaj'
                -- AND (ingest_request.ingest_type = 'pdf'
                --     OR ingest_request.ingest_type = 'xml')
                AND (
                    ingest_file_result.status IS NULL
                    OR ingest_file_result.status = 'no-capture'
                )
                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
        ) t1
    ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
    # COPY 3962331

Transform:

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
    # 3.96M 0:01:47 [36.7k/s]

Top domains:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
     789988 www.mdpi.com
     318142 www.frontiersin.org
     226316 link.springer.com
     204429 www.scielo.br
     201175 www.sciencedirect.com
      72852 ieeexplore.ieee.org
      68983 dx.doi.org
      33286 www.dovepress.com
      26020 elifesciences.org
      23838 www.cetjournal.it
      21102 mab-online.nl
      20242 www.revistas.usp.br
      16564 periodicos.uem.br
      15710 journals.openedition.org
      14514 dergipark.org.tr
      14072 apcz.umk.pl
      13924 ojs.minions.amsterdam
      13717 bmgn-lchr.nl
      13512 ojstest.minions.amsterdam
      10440 journals.asm.org

Bulk ingest:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # Done
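As a quick sanity check that the pushed requests look reasonable, kafkacat's
consumer mode can read back the tail of the topic. A sketch (not part of the
original run); `-o -5` is a relative offset, so this prints the last few
messages from each partition:

    kafkacat -C -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -o -5 -e | jq .base_url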
## Stats Again

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- ORDER BY ingest_request.ingest_type, COUNT DESC
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 4704006
     html        | wrong-scope              | 1761227
     html        | success                  |  778165
     pdf         | no-pdf-link              |  759805
     html        | no-capture               |  382080
     html        | unknown-scope            |  313391
     html        | html-resource-no-capture |  292953
     pdf         | no-capture               |  290311
     pdf         | terminal-bad-status      |  271776
     pdf         | null-body                |  129267
     pdf         | blocked-cookie           |  108491
     html        | terminal-bad-status      |  103014
     html        | null-body                |  100296
     html        | blocked-cookie           |   88533
     pdf         |                          |   81517
     pdf         | skip-url-blocklist       |   76443
     html        | spn2-backoff             |   50615
     pdf         | link-loop                |   45516
     html        | wrong-mimetype           |   33525
     html        | wayback-content-error    |   25535
     pdf         | empty-blob               |   21431
     pdf         | redirect-loop            |   19795
     html        | petabox-error            |   18291
     html        | empty-blob               |   14391
     pdf         | wrong-mimetype           |   14084
     html        | redirect-loop            |   12856
     xml         | success                  |   10381
     xml         | no-capture               |   10008
     html        | skip-url-blocklist       |    3294
     html        | cdx-error                |    3275
    (30 rows)

Pretty good success rate for PDFs, but that is a lot of `no-capture`! And why
are there 81k PDF requests with no attempt at all? Possibly a filter, or bogus
URLs.

Over 1.5M new PDF successes over this crawl iteration, nice.
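One way to poke at the 81k no-attempt PDF requests would be to group the
NULL-status rows by host and see whether bogus or blocklisted domains dominate.
A sketch of such a query (not yet run):

    SELECT substring(ingest_request.base_url FROM '://([^/]+)') AS host, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_file_result.status IS NULL
    GROUP BY host
    ORDER BY COUNT(*) DESC
    LIMIT 20;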
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@

Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.

    export PATCHDATE=2022-07-29
    export CRAWLVM=wbgrp-svc279.us.archive.org
    export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07

## Seedlist Query

Terminal URLs dump:

    COPY (
        SELECT row_to_json(t) FROM (
            SELECT ingest_file_result.terminal_url, ingest_request.*
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.ingest_type = ingest_request.ingest_type
                AND ingest_file_result.base_url = ingest_request.base_url
            WHERE
                (
                    ingest_request.ingest_type = 'pdf'
                    OR ingest_request.ingest_type = 'html'
                )
                -- AND ingest_file_result.updated >= '2022-01-12'
                AND (
                    ingest_file_result.status = 'no-capture'
                    OR ingest_file_result.status = 'cdx-error'
                    OR ingest_file_result.status = 'wayback-error'
                    OR ingest_file_result.status = 'wayback-content-error'
                    OR ingest_file_result.status = 'petabox-error'
                    OR ingest_file_result.status LIKE 'spn2-%'
                    OR ingest_file_result.status = 'gateway-timeout'
                    OR (
                        ingest_file_result.status = 'terminal-bad-status'
                        AND (
                            ingest_file_result.terminal_status_code = 500
                            OR ingest_file_result.terminal_status_code = 502
                            OR ingest_file_result.terminal_status_code = 503
                            OR ingest_file_result.terminal_status_code = 429
                        )
                    )
                )
                AND (
                    ingest_request.link_source = 'doi'
                    OR ingest_request.link_source = 'doaj'
                    OR ingest_request.link_source = 'dblp'
                    OR ingest_request.link_source = 'arxiv'
                    OR ingest_request.link_source = 'pmc'
                    -- OR ingest_request.link_source = 'unpaywall'
                    -- OR ingest_request.link_source = 'oai'
                )

                AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
                AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
                AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'

                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
                AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
                AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'

                -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
        ) t
    ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
    => COPY 3524573
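As a quick sanity check on the dump, counting rows per `link_source` should
roughly match expectations from the individual bulk loads. A sketch (not part
of the original run), relying on `row_to_json` exposing the `link_source`
column:

    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
        | rg -v "\\\\" \
        | jq -r .link_source \
        | sort | uniq -c | sort -nr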
    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
        | rg -v "\\\\" \
        | jq -r .terminal_url \
        | rg '://' \
        | rg -i '^http' \
        | rg -v '://10\.' \
        | rg -v '://172\.' \
        | sort -u -S 4G \
        | pv -l \
        > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
    => 3.11M 0:01:08 [45.4k/s]

    # check top domains
    cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
     624948 doi.org
     382492 www.jstage.jst.go.jp
     275087 www.mdpi.com
     157134 www.persee.fr
     108979 www.sciencedirect.com
      94375 www.scielo.br
      50834 onlinelibrary.wiley.com
      49991 journals.lww.com
      30354 www.frontiersin.org
      27963 doaj.org
      27058 www.e-periodica.ch
      24147 dl.acm.org
      23389 aclanthology.org
      22086 www.research-collection.ethz.ch
      21589 medien.die-bonn.de
      18866 www.ingentaconnect.com
      18583 doi.nrct.go.th
      18271 repositories.lib.utexas.edu
      17634 hdl.handle.net
      16366 archives.datapages.com
      15146 cgscholar.com
      13987 dl.gi.de
      13188 www.degruyter.com
      12503 ethos.bl.uk
      12304 preprints.jmir.org

    cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
    => done

    scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
    ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/


## Re-Ingest

Transform:

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
    => 3.52M 0:01:37 [36.2k/s]

Ingest:

    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
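Once the patch crawl lands and these requests drain from the bulk queue, the
same status-query pattern used above can be re-run, restricted to the
seedlist's link sources, to measure how much of the `no-capture` and error
backlog was recovered. A sketch:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source IN ('doi', 'doaj', 'dblp', 'arxiv', 'pmc')
    GROUP BY ingest_request.ingest_type, status
    ORDER BY COUNT DESC
    LIMIT 30;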