aboutsummaryrefslogtreecommitdiffstats
path: root/sql/backfill/backfill.md
diff options
context:
space:
mode:
Diffstat (limited to 'sql/backfill/backfill.md')
-rw-r--r--sql/backfill/backfill.md13
1 files changed, 13 insertions, 0 deletions
diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md
index f1a5f86..4a56065 100644
--- a/sql/backfill/backfill.md
+++ b/sql/backfill/backfill.md
@@ -76,6 +76,19 @@ In psql:
COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
# => COPY 24727350
+In 2021-11-26:
+
+ zcat file_export.json.gz \
+ | pv -l \
+ | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \
+ | sort -S 8G \
+ | uniq -w 40 \
+ | pigz \
+ > fatcat_file.2021-11-26.tsv.gz
+
+ COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL '');
+ # COPY 112086814
+
## `file_meta`
zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv