aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-02-08 17:47:59 -0800
committerBryan Newbold <bnewbold@archive.org>2022-02-08 17:47:59 -0800
commit067c97a59a4a8728add7b9e561082a5403be52e5 (patch)
tree1d425c967ab78f696c861195954431b47a00d3d5 /notes/ingest
parentb76630806d43427ddcdef8cd6efd110c731668a5 (diff)
downloadsandcrawler-067c97a59a4a8728add7b9e561082a5403be52e5.tar.gz
sandcrawler-067c97a59a4a8728add7b9e561082a5403be52e5.zip
OAI-PMH patch crawl more updates
Diffstat (limited to 'notes/ingest')
-rw-r--r--notes/ingest/2021-09-03_patch_crawl.md73
1 files changed, 71 insertions, 2 deletions
diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md
index ad69311..e664c16 100644
--- a/notes/ingest/2021-09-03_patch_crawl.md
+++ b/notes/ingest/2021-09-03_patch_crawl.md
@@ -566,9 +566,78 @@ requests, slightly updated to allow `https://doi.org/10*` in terminal URL:
=> 4,488,193
./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json
+ => DONE
cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => TIMEDOUT
+ => (probably due to re-assignment)
+ => TODO: try again
+
+## Stats Again (just OAI-PMH)
+
+OAI-PMH query:
-## Stats Again
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repec:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%'
+ AND ingest_request.base_url NOT LIKE '%www.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%'
+ AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%'
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%'
+ AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%'
+ AND ingest_request.base_url NOT LIKE '%doaj.org%'
+ AND ingest_request.base_url NOT LIKE '%orcid.org%'
+ AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%'
+ AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+On 2022-02-08:
+
+ status | count
+ -----------------------+----------
+ success | 13505143
+ no-pdf-link | 8741007
+ no-capture | 4429986
+ redirect-loop | 1566611
+ terminal-bad-status | 816162
+ link-loop | 459006
+ wrong-mimetype | 448983
+ null-body | 71871
+ cdx-error | 19055
+ | 15275
+ petabox-error | 11713
+ blocked-cookie | 11664
+ wayback-error | 8745
+ skip-url-blocklist | 7828
+ max-hops-exceeded | 2031
+ wayback-content-error | 338
+ body-too-large | 280
+ spn2-error:job-failed | 191
+ bad-redirect | 134
+ redirects-exceeded | 120
+ (20 rows)
-Re-run the "progress check" stuff from above