From 4f02b47f57364195e7302ec80565ce51fd20048d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Dec 2021 19:06:33 -0800 Subject: update fatcat_file SQL table schema, and add backfill notes --- sql/backfill/backfill.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'sql/backfill') diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md index f1a5f86..4a56065 100644 --- a/sql/backfill/backfill.md +++ b/sql/backfill/backfill.md @@ -76,6 +76,19 @@ In psql: COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); # => COPY 24727350 +In 2021-11-26: + + zcat file_export.json.gz \ + | pv -l \ + | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \ + | sort -S 8G \ + | uniq -w 40 \ + | pigz \ + > fatcat_file.2021-11-26.tsv.gz + + COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 112086814 + ## `file_meta` zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv -- cgit v1.2.3