aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2020-05_oai_pmh.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2020-05_oai_pmh.md')
-rw-r--r--notes/ingest/2020-05_oai_pmh.md38
1 files changed, 36 insertions, 2 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
index 4cfd8d5..37e7dfc 100644
--- a/notes/ingest/2020-05_oai_pmh.md
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -118,8 +118,42 @@ Dump ingest requests:
AND ingest_request.link_source = 'oai'
AND date(ingest_request.created) > '2020-05-01'
AND ingest_file_result.status IS NULL
- ) TO '/grande/snapshots/oai_noingest_20200506.requests.json';
+ ) TO '/grande/snapshots/oai_noingest_20200506.rows.json';
=> COPY 49491452
- cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ WARNING: should have transformed from rows to requests here
+
+ cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Crawl and re-ingest
+
+Updated stats after ingest (NOTE: ingest requests not really formed correctly,
+but doesn't matter because fatcat wasn't importing these anyways):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Dump again for crawling:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error')
+ ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json';