aboutsummaryrefslogtreecommitdiffstats
path: root/postgresql/backfill_file_meta.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-08-09 16:57:52 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-08-09 16:57:52 -0700
commit db711e1ac9e4b8ba8c9d5229bf310e89ffc27a47 (patch)
tree d977dfee48cfd947aec3c424733aebe74646d25e /postgresql/backfill_file_meta.py
parent 9e4657d49dd91f1249042865505d1a9ea8ad2ea6 (diff)
download sandcrawler-db711e1ac9e4b8ba8c9d5229bf310e89ffc27a47.tar.gz
sandcrawler-db711e1ac9e4b8ba8c9d5229bf310e89ffc27a47.zip
move postgres/rest directory
Diffstat (limited to 'postgresql/backfill_file_meta.py')
-rwxr-xr-x postgresql/backfill_file_meta.py 55
1 files changed, 0 insertions, 55 deletions
diff --git a/postgresql/backfill_file_meta.py b/postgresql/backfill_file_meta.py
deleted file mode 100755
index e3b40a0..0000000
--- a/postgresql/backfill_file_meta.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-"""
-This is a "one-time" tranform helper script for file_meta backfill into
-sandcrawler postgresql.
-
-Most of this file was copied from '../python/common.py'.
-"""
-
-import json, os, sys, collections
-import psycopg2
-import psycopg2.extras
-
-
-def insert(cur, batch):
- sql = """
- INSERT INTO
- file_meta
- VALUES %s
- ON CONFLICT DO NOTHING;
- """
- res = psycopg2.extras.execute_values(cur, sql, batch)
-
-def stdin_to_pg():
- # no host means it will use local domain socket by default
- conn = psycopg2.connect(database="sandcrawler", user="postgres")
- cur = conn.cursor()
- counts = collections.Counter({'total': 0})
- batch = []
- for l in sys.stdin:
- if counts['raw_lines'] > 0 and counts['raw_lines'] % 10000 == 0:
- print("Progress: {}...".format(counts))
- counts['raw_lines'] += 1
- if not l.strip():
- continue
- info = l.split("\t")
- if not info:
- continue
- assert len(info) == 5
- info[-1] = info[-1].strip() or None
- batch.append(info)
- counts['total'] += 1
- if len(batch) >= 1000:
- insert(cur, batch)
- conn.commit()
- batch = []
- counts['batches'] += 1
- if batch:
- insert(cur, batch)
- batch = []
- conn.commit()
- cur.close()
- print("Done: {}".format(counts))
-
-if __name__=='__main__':
- stdin_to_pg()