aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-12-01 19:06:33 -0800
committerBryan Newbold <bnewbold@archive.org>2021-12-01 19:06:33 -0800
commit4f02b47f57364195e7302ec80565ce51fd20048d (patch)
treeb803b35a6f37a2d7c258e34f06c7cd00f347d2e2 /sql
parent85a9c9008ab66680047fb151996c55566d56cbe3 (diff)
downloadsandcrawler-4f02b47f57364195e7302ec80565ce51fd20048d.tar.gz
sandcrawler-4f02b47f57364195e7302ec80565ce51fd20048d.zip
update fatcat_file SQL table schema, and add backfill notes
Diffstat (limited to 'sql')
-rw-r--r--sql/backfill/backfill.md13
1 files changed, 13 insertions, 0 deletions
diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md
index f1a5f86..4a56065 100644
--- a/sql/backfill/backfill.md
+++ b/sql/backfill/backfill.md
@@ -76,6 +76,19 @@ In psql:
COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
# => COPY 24727350
+On 2021-11-26:
+
+ zcat file_export.json.gz \
+ | pv -l \
+ | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \
+ | sort -S 8G \
+ | uniq -w 40 \
+ | pigz \
+ > fatcat_file.2021-11-26.tsv.gz
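+
+Decompress the result to `/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv` first (`COPY ... FROM` a file path reads plain text, not gzip), then in psql:
+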
+ COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # => COPY 112086814
+
## `file_meta`
zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv