diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-12-01 19:06:33 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-12-01 19:06:33 -0800 |
commit | 4f02b47f57364195e7302ec80565ce51fd20048d (patch) | |
tree | b803b35a6f37a2d7c258e34f06c7cd00f347d2e2 /sql | |
parent | 85a9c9008ab66680047fb151996c55566d56cbe3 (diff) | |
download | sandcrawler-4f02b47f57364195e7302ec80565ce51fd20048d.tar.gz sandcrawler-4f02b47f57364195e7302ec80565ce51fd20048d.zip |
update fatcat_file SQL table schema, and add backfill notes
Diffstat (limited to 'sql')
-rw-r--r-- | sql/backfill/backfill.md | 13 |
1 files changed, 13 insertions, 0 deletions
```diff
diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md
index f1a5f86..4a56065 100644
--- a/sql/backfill/backfill.md
+++ b/sql/backfill/backfill.md
@@ -76,6 +76,19 @@ In psql:
     COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
     # => COPY 24727350
 
+In 2021-11-26:
+
+    zcat file_export.json.gz \
+    | pv -l \
+    | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \
+    | sort -S 8G \
+    | uniq -w 40 \
+    | pigz \
+    > fatcat_file.2021-11-26.tsv.gz
+
+    COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+    # COPY 112086814
+
 ## `file_meta`
 
     zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv
```