diff options
Diffstat (limited to 'postgresql/backfill_file_meta.py')
-rwxr-xr-x | postgresql/backfill_file_meta.py | 55 |
1 files changed, 0 insertions, 55 deletions
diff --git a/postgresql/backfill_file_meta.py b/postgresql/backfill_file_meta.py deleted file mode 100755 index e3b40a0..0000000 --- a/postgresql/backfill_file_meta.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" -This is a "one-time" tranform helper script for file_meta backfill into -sandcrawler postgresql. - -Most of this file was copied from '../python/common.py'. -""" - -import json, os, sys, collections -import psycopg2 -import psycopg2.extras - - -def insert(cur, batch): - sql = """ - INSERT INTO - file_meta - VALUES %s - ON CONFLICT DO NOTHING; - """ - res = psycopg2.extras.execute_values(cur, sql, batch) - -def stdin_to_pg(): - # no host means it will use local domain socket by default - conn = psycopg2.connect(database="sandcrawler", user="postgres") - cur = conn.cursor() - counts = collections.Counter({'total': 0}) - batch = [] - for l in sys.stdin: - if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0: - print("Progress: {}...".format(counts)) - counts['raw_lines'] += 1 - if not l.strip(): - continue - info = l.split("\t") - if not info: - continue - assert len(info) == 5 - info[-1] = info[-1].strip() or None - batch.append(info) - counts['total'] += 1 - if len(batch) >= 1000: - insert(cur, batch) - conn.commit() - batch = [] - counts['batches'] += 1 - if batch: - insert(cur, batch) - batch = [] - conn.commit() - cur.close() - print("Done: {}".format(counts)) - -if __name__=='__main__': - stdin_to_pg() |