aboutsummaryrefslogtreecommitdiffstats
path: root/sql/backfill/backfill_file_meta.py
diff options
context:
space:
mode:
Diffstat (limited to 'sql/backfill/backfill_file_meta.py')
-rwxr-xr-xsql/backfill/backfill_file_meta.py55
1 files changed, 55 insertions, 0 deletions
diff --git a/sql/backfill/backfill_file_meta.py b/sql/backfill/backfill_file_meta.py
new file mode 100755
index 0000000..e3b40a0
--- /dev/null
+++ b/sql/backfill/backfill_file_meta.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+This is a "one-time" tranform helper script for file_meta backfill into
+sandcrawler postgresql.
+
+Most of this file was copied from '../python/common.py'.
+"""
+
+import json, os, sys, collections
+import psycopg2
+import psycopg2.extras
+
+
+def insert(cur, batch):
+ sql = """
+ INSERT INTO
+ file_meta
+ VALUES %s
+ ON CONFLICT DO NOTHING;
+ """
+ res = psycopg2.extras.execute_values(cur, sql, batch)
+
+def stdin_to_pg():
+ # no host means it will use local domain socket by default
+ conn = psycopg2.connect(database="sandcrawler", user="postgres")
+ cur = conn.cursor()
+ counts = collections.Counter({'total': 0})
+ batch = []
+ for l in sys.stdin:
+ if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0:
+ print("Progress: {}...".format(counts))
+ counts['raw_lines'] += 1
+ if not l.strip():
+ continue
+ info = l.split("\t")
+ if not info:
+ continue
+ assert len(info) == 5
+ info[-1] = info[-1].strip() or None
+ batch.append(info)
+ counts['total'] += 1
+ if len(batch) >= 1000:
+ insert(cur, batch)
+ conn.commit()
+ batch = []
+ counts['batches'] += 1
+ if batch:
+ insert(cur, batch)
+ batch = []
+ conn.commit()
+ cur.close()
+ print("Done: {}".format(counts))
+
+if __name__=='__main__':
+ stdin_to_pg()