diff options
Diffstat (limited to 'notes/ingest/2022-07-19_dblp.md')
-rw-r--r-- | notes/ingest/2022-07-19_dblp.md | 50 |
1 files changed, 50 insertions, 0 deletions
diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md new file mode 100644 index 0000000..74aeb8d --- /dev/null +++ b/notes/ingest/2022-07-19_dblp.md @@ -0,0 +1,50 @@ + +Cross-posting from fatcat bulk metadata update/ingest. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +## Post-Crawl Stats + +This is after bulk ingest, crawl, and a bit of "live" re-ingest. Query run +2022-09-06: + + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'dblp' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-----------------------+-------- + pdf | success | 305142 + pdf | no-pdf-link | 192683 + pdf | no-capture | 42634 + pdf | terminal-bad-status | 38041 + pdf | skip-url-blocklist | 31055 + pdf | link-loop | 9263 + pdf | wrong-mimetype | 4545 + pdf | redirect-loop | 3952 + pdf | empty-blob | 2705 + pdf | wayback-content-error | 834 + pdf | wayback-error | 294 + pdf | petabox-error | 202 + pdf | blocked-cookie | 155 + pdf | cdx-error | 115 + pdf | body-too-large | 66 + pdf | bad-redirect | 19 + pdf | timeout | 7 + pdf | bad-gzip-encoding | 4 + (18 rows) + +That is quite a lot of `no-pdf-link`, might be worth doing a random sample +and/or re-ingest. And a chunk of `no-capture` to retry. |