aboutsummaryrefslogtreecommitdiffstats
path: root/notes
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-10-01 19:52:06 -0700
committerBryan Newbold <bnewbold@archive.org>2020-10-01 19:52:06 -0700
commit9753876b85c767a9848467065b4d4dd613d5ed68 (patch)
tree932d8d38c1ac9ff5513057a62fc637d96cc24ebf /notes
parent302489c3691c5d4285d1d62fdb99ef05d5f79acf (diff)
downloadsandcrawler-9753876b85c767a9848467065b4d4dd613d5ed68.tar.gz
sandcrawler-9753876b85c767a9848467065b4d4dd613d5ed68.zip
notes on file_meta task (from august)
Diffstat (limited to 'notes')
-rw-r--r--notes/tasks/2020-08-20_file_meta.md66
1 files changed, 66 insertions, 0 deletions
diff --git a/notes/tasks/2020-08-20_file_meta.md b/notes/tasks/2020-08-20_file_meta.md
new file mode 100644
index 0000000..39c84dd
--- /dev/null
+++ b/notes/tasks/2020-08-20_file_meta.md
@@ -0,0 +1,66 @@
+
+Want to update fatcat file entities with "full" file metadata for those which are missing it.
+
+How many `file_meta` rows *still* don't have metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 62962
+
+First generate list of sha1hex from most recent bulk export which are missing
+at least some metadata (based on missing sha256):
+
+ zcat file_hashes.tsv.gz | rg '\t\t' | cut -f3 | sort -u -S 4G | pv -l > fatcat_file_partial_sha1hex.tsv
+ => 18.7M 0:05:46 [53.8k/s]
+
+Then dump the entire sandcrawler `file_meta` table as TSV, with first column
+sha1hex and second column JSON with all the file metadata fields:
+
+ COPY (
+ SELECT sha1hex, row_to_json(file_meta)
+ FROM file_meta
+ WHERE sha256hex IS NOT NULL
+ ORDER BY sha1hex ASC
+ )
+ TO '/grande/snapshots/file_meta_dump.tsv'
+ WITH NULL '';
+
+Join/cut:
+
+ export LC_ALL=C
+ join -t$'\t' fatcat_file_partial_sha1hex.tsv /grande/snapshots/file_meta_dump.tsv | uniq -w 40 | cut -f2 | pv -l > fatcat_file_partial.file_meta.json
+ => 18.1M 0:03:37 [83.2k/s]
+
+Check counts:
+
+ cat fatcat_file_partial.file_meta.json | jq .sha1hex -r | sort -u -S 4G | wc -l
+ => 18135313
+
+ zcat fatcat_file_partial.file_meta.json.gz | jq .mimetype -r | sort -S 4G | uniq -c | sort -nr
+ 18103860 application/pdf
+ 29977 application/octet-stream
+ 876 text/html
+ 199 application/postscript
+ 171 application/gzip
+ 84 text/plain
+ 48 application/xml
+ 38 application/vnd.ms-powerpoint
+ 16 application/msword
+ 8 application/vnd.openxmlformats-officedocument.wordprocessingml.document
+ 6 image/jpeg
+ 4 message/rfc822
+ 4 application/zip
+ 4 application/vnd.openxmlformats-officedocument.presentationml.presentation
+ 3 text/x-tex
+ 3 application/x-dosexec
+ 2 application/x-tar
+ 2 application/vnd.ms-tnef
+ 1 video/mpeg
+ 1 image/tiff
+ 1 image/svg+xml
+ 1 image/png
+ 1 image/gif
+ 1 audio/x-ape
+ 1 application/vnd.ms-office
+ 1 application/CDFV2-unknown
+
+TODO: fatcat importer