diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-01 19:52:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-01 19:52:06 -0700 |
commit | 9753876b85c767a9848467065b4d4dd613d5ed68 (patch) | |
tree | 932d8d38c1ac9ff5513057a62fc637d96cc24ebf | |
parent | 302489c3691c5d4285d1d62fdb99ef05d5f79acf (diff) | |
download | sandcrawler-9753876b85c767a9848467065b4d4dd613d5ed68.tar.gz sandcrawler-9753876b85c767a9848467065b4d4dd613d5ed68.zip |
notes on file_meta task (from august)
-rw-r--r-- | notes/tasks/2020-08-20_file_meta.md | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md
new file mode 100644
index 0000000..39c84dd
--- /dev/null
+++ b/notes/tasks/2020-08-20_file_meta.md
@@ -0,0 +1,66 @@
+
+Want to update fatcat file entities with "full" file metadata for those which are missing it.
+
+How many `file_meta` rows *still* don't have metadata?
+
+    SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+    => 62962
+
+First generate list of sha1hex from most recent bulk export which are missing
+at least some metadata (based on missing sha256):
+
+    zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv
+    => 18.7M 0:05:46 [53.8k/s]
+
+Then dump the entire sandcrawler `file_meta` table as TSV, with first column
+sha1hex and second column JSON with all the file metadata fields:
+
+    COPY (
+      SELECT sha1hex, row_to_json(file_meta)
+      FROM file_meta
+      WHERE sha256hex IS NOT NULL
+      ORDER BY sha1hex ASC
+    )
+    TO '/grande/snapshots/file_meta_dump.tsv'
+    WITH NULL '';
+
+Join/cut:
+
+    export LC_ALL=C
+    join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json
+    => 18.1M 0:03:37 [83.2k/s]
+
+Check counts:
+
+    cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l
+    => 18135313
+
+    zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr
+    18103860 application/pdf
+       29977 application/octet-stream
+         876 text/html
+         199 application/postscript
+         171 application/gzip
+          84 text/plain
+          48 application/xml
+          38 application/vnd.ms-powerpoint
+          16 application/msword
+           8 application/vnd.openxmlformats-officedocument.wordprocessingml.document
+           6 image/jpeg
+           4 message/rfc822
+           4 application/zip
+           4 application/vnd.openxmlformats-officedocument.presentationml.presentation
+           3 text/x-tex
+           3 application/x-dosexec
+           2 application/x-tar
+           2 application/vnd.ms-tnef
+           1 video/mpeg
+           1 image/tiff
+           1 image/svg+xml
+           1 image/png
+           1 image/gif
+           1 audio/x-ape
+           1 application/vnd.ms-office
+           1 application/CDFV2-unknown
+
+TODO: fatcat importer