diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-01-05 11:54:49 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-01-05 11:54:49 -0800 |
commit | 65a0d38bcedca0610ca6fa8e053199f324062ace (patch) | |
tree | cb210178e213f0384698df38f02c862795cc2263 /notes | |
parent | e4c153d4f6d8c815c7e792fec2f40cdac39bab35 (diff) | |
download | sandcrawler-65a0d38bcedca0610ca6fa8e053199f324062ace.tar.gz sandcrawler-65a0d38bcedca0610ca6fa8e053199f324062ace.zip |
document progress on re-GROBID-ing
Diffstat (limited to 'notes')
-rw-r--r-- | notes/tasks/2021-12-06_regrobid.md | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/notes/tasks/2021-12-06_regrobid.md b/notes/tasks/2021-12-06_regrobid.md index 65e9fe3..d879277 100644 --- a/notes/tasks/2021-12-06_regrobid.md +++ b/notes/tasks/2021-12-06_regrobid.md @@ -191,6 +191,82 @@ And some earlier files of interest on `aitio`: | pv -l \ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.unextracted -p -1 + +## Ancient Fatcat Files + +Files from an era where we didn't record GROBID version or status, even for +success. + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status_code = 200 + AND grobid.status IS NULL + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitrary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json' + WITH NULL ''; + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_status_null.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + # 107k 0:00:03 [29.9k/s] + + +## Start Re-Processing Old GROBID Versions + + COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE + grobid.status = 'success' + AND grobid.grobid_version NOT LIKE '0.7.%' + AND cdx.sha1hex IS NOT NULL + AND fatcat_file.sha1hex IS NOT NULL + -- sort of arbitrary "not recently" date filter + AND (grobid.updated IS NULL OR grobid.updated < '2021-11-15') + -- LIMIT 5; + ) + TO '/srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json' + WITH NULL ''; + +This one is huge, and we want to process it in batches/chunks of ~5 million at a time. 
+ + cd /srv/sandcrawler/tasks/ + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.json \ + | split --lines 5000000 - ungrobided_fatcat.2021-12-11.grobid_old.split_ -d --additional-suffix .json + +Submit individual batches like: + + cat /srv/sandcrawler/tasks/ungrobided_fatcat.2021-12-11.grobid_old.split_01.json \ + | rg -v "\\\\" \ + | jq . -c \ + | pv -l \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ungrobided-pg -p -1 + +Overall progress: + + x ungrobided_fatcat.2021-12-11.grobid_old.split_00.json + . ungrobided_fatcat.2021-12-11.grobid_old.split_01.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_02.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_03.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_04.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_05.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_06.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_07.json + => ungrobided_fatcat.2021-12-11.grobid_old.split_08.json (small) + ## General Counts How many fatcat files of what mimetype (reported in sandcrawler-db)? @@ -287,3 +363,16 @@ What are the GROBID status codes for fatcat files? Narrowed down: error | 200 | 3 (7 rows) +Ran the same query again on 2021-12-15: + + status | status_code | count + ----------------+-------------+---------- + success | 200 | 45092915 + error | 500 | 302373 + | | 250335 + | 200 | 53352 + bad-grobid-xml | 200 | 39 + error-timeout | -4 | 37 + error | 200 | 34 + error | 503 | 2 + (8 rows) |