diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-26 19:31:49 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-26 19:31:52 -0700 |
commit | e96972ef531818eff610327a2a4c310a12ecdb14 (patch) | |
tree | 02c844b4e08505df5ec40a82792f0b30a1d5d642 /notes/ingest/2020-05_oai_pmh.md | |
parent | a75f85eb109358a0ef564688553c4e1e479b53df (diff) | |
download | sandcrawler-e96972ef531818eff610327a2a4c310a12ecdb14.tar.gz sandcrawler-e96972ef531818eff610327a2a4c310a12ecdb14.zip |
ingest notes
Diffstat (limited to 'notes/ingest/2020-05_oai_pmh.md')
-rw-r--r-- | notes/ingest/2020-05_oai_pmh.md | 38 |
1 file changed, 36 insertions, 2 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md index 4cfd8d5..37e7dfc 100644 --- a/notes/ingest/2020-05_oai_pmh.md +++ b/notes/ingest/2020-05_oai_pmh.md @@ -118,8 +118,42 @@ Dump ingest requests: AND ingest_request.link_source = 'oai' AND date(ingest_request.created) > '2020-05-01' AND ingest_file_result.status IS NULL - ) TO '/grande/snapshots/oai_noingest_20200506.requests.json'; + ) TO '/grande/snapshots/oai_noingest_20200506.rows.json'; => COPY 49491452 - cat /grande/snapshots/oai_noingest_20200506.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + WARNING: should have transformed from rows to requests here + + cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Crawl and re-ingest + +Updated stats after ingest (NOTE: ingest requests not really formed correctly, +but doesn't matter because fatcat wasn't importing these anyways): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +Dump again for crawling: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error') + ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json'; 
|