From b362abd38ad4a6624bc056c58eb90ae235c63f00 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 23 Sep 2019 23:00:23 -0700 Subject: rename postgrest directory sql --- sql/backfill/backfill_file_meta.py | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100755 sql/backfill/backfill_file_meta.py (limited to 'sql/backfill/backfill_file_meta.py') diff --git a/sql/backfill/backfill_file_meta.py b/sql/backfill/backfill_file_meta.py new file mode 100755 index 0000000..e3b40a0 --- /dev/null +++ b/sql/backfill/backfill_file_meta.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +""" +This is a "one-time" tranform helper script for file_meta backfill into +sandcrawler postgresql. + +Most of this file was copied from '../python/common.py'. +""" + +import json, os, sys, collections +import psycopg2 +import psycopg2.extras + + +def insert(cur, batch): + sql = """ + INSERT INTO + file_meta + VALUES %s + ON CONFLICT DO NOTHING; + """ + res = psycopg2.extras.execute_values(cur, sql, batch) + +def stdin_to_pg(): + # no host means it will use local domain socket by default + conn = psycopg2.connect(database="sandcrawler", user="postgres") + cur = conn.cursor() + counts = collections.Counter({'total': 0}) + batch = [] + for l in sys.stdin: + if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: + print("Progress: {}...".format(counts)) + counts['raw_lines'] += 1 + if not l.strip(): + continue + info = l.split("\t") + if not info: + continue + assert len(info) == 5 + info[-1] = info[-1].strip() or None + batch.append(info) + counts['total'] += 1 + if len(batch) >= 1000: + insert(cur, batch) + conn.commit() + batch = [] + counts['batches'] += 1 + if batch: + insert(cur, batch) + batch = [] + conn.commit() + cur.close() + print("Done: {}".format(counts)) + +if __name__=='__main__': + stdin_to_pg() -- cgit v1.2.3