author    Bryan Newbold <bnewbold@archive.org>  2022-09-06 09:49:37 -0700
committer Bryan Newbold <bnewbold@archive.org>  2022-09-06 09:49:37 -0700
commit    49836dba33bfb481213c03fe4f84c4d876d2d3bd (patch)
tree      739b6744b12914e07951bd8186a83c55624e5268
parent    8918b4106aa33d936f07df41ac0bdc65825e6ef4 (diff)
download  sandcrawler-49836dba33bfb481213c03fe4f84c4d876d2d3bd.tar.gz
          sandcrawler-49836dba33bfb481213c03fe4f84c4d876d2d3bd.zip

summer 2022 ingest notes

 notes/ingest/2022-07-19_dblp.md  |  50
 notes/ingest/2022-07_doaj.md     | 199
 notes/ingest/2022-07_targeted.md | 140
 3 files changed, 389 insertions, 0 deletions
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md
new file mode 100644
index 0000000..74aeb8d
--- /dev/null
+++ b/notes/ingest/2022-07-19_dblp.md
@@ -0,0 +1,50 @@

Cross-posting from fatcat bulk metadata update/ingest.

    zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # 631k 0:00:11 [54.0k/s]


## Post-Crawl Stats

This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run
2022-09-06:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'dblp'
    GROUP BY ingest_request.ingest_type, status
    -- ORDER BY ingest_request.ingest_type, COUNT DESC
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |        status         | count
    -------------+-----------------------+--------
     pdf         | success               | 305142
     pdf         | no-pdf-link           | 192683
     pdf         | no-capture            |  42634
     pdf         | terminal-bad-status   |  38041
     pdf         | skip-url-blocklist    |  31055
     pdf         | link-loop             |   9263
     pdf         | wrong-mimetype        |   4545
     pdf         | redirect-loop         |   3952
     pdf         | empty-blob            |   2705
     pdf         | wayback-content-error |    834
     pdf         | wayback-error         |    294
     pdf         | petabox-error         |    202
     pdf         | blocked-cookie        |    155
     pdf         | cdx-error             |    115
     pdf         | body-too-large        |     66
     pdf         | bad-redirect          |     19
     pdf         | timeout               |      7
     pdf         | bad-gzip-encoding     |      4
    (18 rows)

That is quite a lot of `no-pdf-link`; it would be worth doing a random sample
and/or a re-ingest. There is also a chunk of `no-capture` worth retrying.
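For the random-sample idea, something like the following would pull the
`no-pdf-link` terminal URLs for manual review. This is only a sketch, not a
command that was run; the `sandcrawler` database name and the output path are
assumptions:

    # dump dblp no-pdf-link terminal URLs (database name and path are assumptions)
    psql sandcrawler -c "COPY (
        SELECT ingest_file_result.terminal_url
        FROM ingest_request
        LEFT JOIN ingest_file_result
            ON ingest_file_result.ingest_type = ingest_request.ingest_type
            AND ingest_file_result.base_url = ingest_request.base_url
        WHERE ingest_request.link_source = 'dblp'
            AND ingest_file_result.status = 'no-pdf-link'
    ) TO STDOUT" > /srv/sandcrawler/tasks/dblp_no_pdf_link.terminal_urls.txt

    # eyeball a random sample of ~50 URLs
    shuf -n 50 /srv/sandcrawler/tasks/dblp_no_pdf_link.terminal_urls.txt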
diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md
new file mode 100644
index 0000000..7e55633
--- /dev/null
+++ b/notes/ingest/2022-07_doaj.md
@@ -0,0 +1,199 @@

This is just a load and bulk ingest; a separate 'TARGETED' heritrix bulk crawl
will follow, covering these DOAJ URLs along with the JALC and DBLP URLs.

    export SNAPSHOT=2022-07-20

## Transform and Load

    # on sandcrawler-vm
    mkdir -p /srv/sandcrawler/tasks/doaj
    cd /srv/sandcrawler/tasks/doaj
    wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz"

    # in pipenv, in python directory
    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz
    # 9.72M 0:36:28 [4.44k/s]

    zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request -
    # 9.72M 0:17:04 [9.49k/s]
    # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0})
    # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097})

Stats after this load:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- next time include ingest_type in sort
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 3165539
     pdf         |                          | 2078874
     html        |                          | 1547698
     html        | wrong-scope              | 1114332
     pdf         | no-pdf-link              |  517261
     html        | success                  |  388376
     html        | unknown-scope            |  242044
     pdf         | no-capture               |  179030
     pdf         | terminal-bad-status      |  174741
     html        | no-capture               |  155323
     pdf         | null-body                |  129267
     pdf         | redirect-loop            |  127136
     html        | html-resource-no-capture |  117275
     html        | null-body                |  100296
     pdf         | blocked-cookie           |   71093
     html        | redirect-loop            |   65519
     html        | terminal-bad-status      |   64856
     html        | blocked-cookie           |   64095
     html        | spn2-backoff             |   55173
     pdf         | link-loop                |   27440
     html        | wrong-mimetype           |   26016
     html        | wayback-content-error    |   20109
     xml         |                          |   13624
     pdf         | wrong-mimetype           |    8411
     xml         | success                  |    6899
     html        | petabox-error            |    6199
     html        | wayback-error            |    5269
     html        | spn2-cdx-lookup-failure  |    4635
     html        | spn2-recent-capture      |    4527
     xml         | null-body                |    2353
    (30 rows)

## Bulk Ingest

    COPY (
        SELECT row_to_json(t1.*)
        FROM (
            SELECT ingest_request.*, ingest_file_result as result
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.base_url = ingest_request.base_url
                AND ingest_file_result.ingest_type = ingest_request.ingest_type
            WHERE
                ingest_request.link_source = 'doaj'
                -- AND (ingest_request.ingest_type = 'pdf'
                --     OR ingest_request.ingest_type = 'xml')
                AND (
                    ingest_file_result.status IS NULL
                    OR ingest_file_result.status = 'no-capture'
                )
                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
                AND ingest_request.base_url NOT LIKE '%://archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://web.archive.org/%'
                AND ingest_request.base_url NOT LIKE '%://www.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%'
        ) t1
    ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json';
    # COPY 3962331

Transform:

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json
    # 3.96M 0:01:47 [36.7k/s]

Top domains:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20
     789988 www.mdpi.com
     318142 www.frontiersin.org
     226316 link.springer.com
     204429 www.scielo.br
     201175 www.sciencedirect.com
      72852 ieeexplore.ieee.org
      68983 dx.doi.org
      33286 www.dovepress.com
      26020 elifesciences.org
      23838 www.cetjournal.it
      21102 mab-online.nl
      20242 www.revistas.usp.br
      16564 periodicos.uem.br
      15710 journals.openedition.org
      14514 dergipark.org.tr
      14072 apcz.umk.pl
      13924 ojs.minions.amsterdam
      13717 bmgn-lchr.nl
      13512 ojstest.minions.amsterdam
      10440 journals.asm.org

Bulk ingest:

    cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
    # Done
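As a quick sanity check that the pushed requests look reasonable, kafkacat's
consumer mode can read back the tail of the topic. A sketch (not part of the
original run); `-o -5` is a relative offset, so this prints the last few
messages from each partition:

    kafkacat -C -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -o -5 -e | jq .base_url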
## Stats Again

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
    GROUP BY ingest_request.ingest_type, status
    -- ORDER BY ingest_request.ingest_type, COUNT DESC
    ORDER BY COUNT DESC
    LIMIT 30;

     ingest_type |          status          |  count
    -------------+--------------------------+---------
     pdf         | success                  | 4704006
     html        | wrong-scope              | 1761227
     html        | success                  |  778165
     pdf         | no-pdf-link              |  759805
     html        | no-capture               |  382080
     html        | unknown-scope            |  313391
     html        | html-resource-no-capture |  292953
     pdf         | no-capture               |  290311
     pdf         | terminal-bad-status      |  271776
     pdf         | null-body                |  129267
     pdf         | blocked-cookie           |  108491
     html        | terminal-bad-status      |  103014
     html        | null-body                |  100296
     html        | blocked-cookie           |   88533
     pdf         |                          |   81517
     pdf         | skip-url-blocklist       |   76443
     html        | spn2-backoff             |   50615
     pdf         | link-loop                |   45516
     html        | wrong-mimetype           |   33525
     html        | wayback-content-error    |   25535
     pdf         | empty-blob               |   21431
     pdf         | redirect-loop            |   19795
     html        | petabox-error            |   18291
     html        | empty-blob               |   14391
     pdf         | wrong-mimetype           |   14084
     html        | redirect-loop            |   12856
     xml         | success                  |   10381
     xml         | no-capture               |   10008
     html        | skip-url-blocklist       |    3294
     html        | cdx-error                |    3275
    (30 rows)

Pretty good success rate for PDFs, but that is a lot of `no-capture`! And why
are there 81k PDF requests with no attempt at all? Possibly a filter, or bogus
URLs.

Over 1.5M new PDF successes over this crawl iteration, nice.
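One way to poke at the 81k no-attempt PDF requests would be to group the
NULL-status rows by host and see whether bogus or blocklisted domains dominate.
A sketch of such a query (not yet run):

    SELECT substring(ingest_request.base_url FROM '://([^/]+)') AS host, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source = 'doaj'
        AND ingest_request.ingest_type = 'pdf'
        AND ingest_file_result.status IS NULL
    GROUP BY host
    ORDER BY COUNT(*) DESC
    LIMIT 20;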
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md
new file mode 100644
index 0000000..415f23b
--- /dev/null
+++ b/notes/ingest/2022-07_targeted.md
@@ -0,0 +1,140 @@

Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs.

    export PATCHDATE=2022-07-29
    export CRAWLVM=wbgrp-svc279.us.archive.org
    export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07

## Seedlist Query

Terminal URLs dump:

    COPY (
        SELECT row_to_json(t) FROM (
            SELECT ingest_file_result.terminal_url, ingest_request.*
            FROM ingest_request
            LEFT JOIN ingest_file_result
                ON ingest_file_result.ingest_type = ingest_request.ingest_type
                AND ingest_file_result.base_url = ingest_request.base_url
            WHERE
                (
                    ingest_request.ingest_type = 'pdf'
                    OR ingest_request.ingest_type = 'html'
                )
                -- AND ingest_file_result.updated >= '2022-01-12'
                AND (
                    ingest_file_result.status = 'no-capture'
                    OR ingest_file_result.status = 'cdx-error'
                    OR ingest_file_result.status = 'wayback-error'
                    OR ingest_file_result.status = 'wayback-content-error'
                    OR ingest_file_result.status = 'petabox-error'
                    OR ingest_file_result.status LIKE 'spn2-%'
                    OR ingest_file_result.status = 'gateway-timeout'
                    OR (
                        ingest_file_result.status = 'terminal-bad-status'
                        AND (
                            ingest_file_result.terminal_status_code = 500
                            OR ingest_file_result.terminal_status_code = 502
                            OR ingest_file_result.terminal_status_code = 503
                            OR ingest_file_result.terminal_status_code = 429
                        )
                    )
                )
                AND (
                    ingest_request.link_source = 'doi'
                    OR ingest_request.link_source = 'doaj'
                    OR ingest_request.link_source = 'dblp'
                    OR ingest_request.link_source = 'arxiv'
                    OR ingest_request.link_source = 'pmc'
                    -- OR ingest_request.link_source = 'unpaywall'
                    -- OR ingest_request.link_source = 'oai'
                )

                AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%'
                AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%'
                AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%'
                -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%'

                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%'
                AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%'
                AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%'

                -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%'
                AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%'
                -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%'
                AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%'
        ) t
    ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json';
    => COPY 3524573
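As a quick sanity check on the dump, counting rows per `link_source` should
roughly match expectations from the individual bulk loads. A sketch (not part
of the original run), relying on `row_to_json` exposing the `link_source`
column:

    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
        | rg -v "\\\\" \
        | jq -r .link_source \
        | sort | uniq -c | sort -nr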
    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \
        | rg -v "\\\\" \
        | jq -r .terminal_url \
        | rg '://' \
        | rg -i '^http' \
        | rg -v '://10\.' \
        | rg -v '://172\.' \
        | sort -u -S 4G \
        | pv -l \
        > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt
    => 3.11M 0:01:08 [45.4k/s]

    # check top domains
    cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25
     624948 doi.org
     382492 www.jstage.jst.go.jp
     275087 www.mdpi.com
     157134 www.persee.fr
     108979 www.sciencedirect.com
      94375 www.scielo.br
      50834 onlinelibrary.wiley.com
      49991 journals.lww.com
      30354 www.frontiersin.org
      27963 doaj.org
      27058 www.e-periodica.ch
      24147 dl.acm.org
      23389 aclanthology.org
      22086 www.research-collection.ethz.ch
      21589 medien.die-bonn.de
      18866 www.ingentaconnect.com
      18583 doi.nrct.go.th
      18271 repositories.lib.utexas.edu
      17634 hdl.handle.net
      16366 archives.datapages.com
      15146 cgscholar.com
      13987 dl.gi.de
      13188 www.degruyter.com
      12503 ethos.bl.uk
      12304 preprints.jmir.org

    cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule
    => done

    scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp
    ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/


## Re-Ingest

Transform:

    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json
    => 3.52M 0:01:37 [36.2k/s]

Ingest:

    cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
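Once the patch crawl lands and these requests drain from the bulk queue, the
same status-query pattern used above can be re-run, restricted to the
seedlist's link sources, to measure how much of the `no-capture` and error
backlog was recovered. A sketch:

    SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*)
    FROM ingest_request
    LEFT JOIN ingest_file_result
        ON ingest_file_result.ingest_type = ingest_request.ingest_type
        AND ingest_file_result.base_url = ingest_request.base_url
    WHERE
        ingest_request.link_source IN ('doi', 'doaj', 'dblp', 'arxiv', 'pmc')
    GROUP BY ingest_request.ingest_type, status
    ORDER BY COUNT DESC
    LIMIT 30;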