diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-08-09 16:57:52 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-08-09 16:57:52 -0700 |
commit | db711e1ac9e4b8ba8c9d5229bf310e89ffc27a47 (patch) | |
tree | d977dfee48cfd947aec3c424733aebe74646d25e /postgrest/backfill/backfill_file_meta.py | |
parent | 9e4657d49dd91f1249042865505d1a9ea8ad2ea6 (diff) | |
download | sandcrawler-db711e1ac9e4b8ba8c9d5229bf310e89ffc27a47.tar.gz sandcrawler-db711e1ac9e4b8ba8c9d5229bf310e89ffc27a47.zip |
move postgres/rest directory
Diffstat (limited to 'postgrest/backfill/backfill_file_meta.py')
-rwxr-xr-x | postgrest/backfill/backfill_file_meta.py | 55 |
1 files changed, 55 insertions, 0 deletions
#!/usr/bin/env python3
"""
This is a "one-time" transform helper script for file_meta backfill into
sandcrawler postgresql.

Most of this file was copied from '../python/common.py'.

Input is read from stdin as tab-separated lines with exactly five columns per
line; rows are inserted into the `file_meta` table in batches.
"""

import collections
import json
import os
import sys

import psycopg2
import psycopg2.extras

# Number of tab-separated columns every non-blank input line must have.
EXPECTED_COLUMNS = 5
# Rows accumulated before each INSERT + COMMIT round trip.
BATCH_SIZE = 1000
# A progress line is printed each time this many raw lines have been read.
PROGRESS_INTERVAL = 10000


def insert(cur, batch):
    """Insert a batch of rows into `file_meta`.

    Args:
        cur: an open psycopg2 cursor.
        batch: a list of row sequences; each row supplies the values for one
            `file_meta` record, in table column order.

    The statement uses ``ON CONFLICT DO NOTHING``, so rows that conflict with
    existing records are skipped instead of raising. The caller is
    responsible for committing.
    """
    sql = """
        INSERT INTO
        file_meta
        VALUES %s
        ON CONFLICT DO NOTHING;
    """
    # execute_values expands the single %s placeholder into the whole batch.
    psycopg2.extras.execute_values(cur, sql, batch)


def stdin_to_pg():
    """Stream tab-separated rows from stdin into the `file_meta` table.

    Blank lines are skipped. Only the last column is stripped of surrounding
    whitespace (which removes the trailing newline); if that leaves it empty
    it is passed as None. Rows are inserted and committed every BATCH_SIZE
    rows, with a final flush for any remainder. Progress and final counters
    are printed to stdout.

    Raises:
        ValueError: if a non-blank line does not have exactly
            EXPECTED_COLUMNS tab-separated fields. Batches committed before
            the bad line stay committed; the pending batch is discarded.
    """
    # no host means it will use local domain socket by default
    conn = psycopg2.connect(database="sandcrawler", user="postgres")
    try:
        cur = conn.cursor()
        try:
            counts = collections.Counter({'total': 0})
            batch = []
            for line in sys.stdin:
                if counts['raw_lines'] > 0 and counts['raw_lines'] % PROGRESS_INTERVAL == 0:
                    print("Progress: {}...".format(counts))
                counts['raw_lines'] += 1
                if not line.strip():
                    continue
                info = line.split("\t")
                # Explicit check instead of `assert`: assertions are stripped
                # under `python -O`, which would let malformed rows through.
                if len(info) != EXPECTED_COLUMNS:
                    raise ValueError(
                        "line {}: expected {} tab-separated fields, got {}".format(
                            counts['raw_lines'], EXPECTED_COLUMNS, len(info)))
                info[-1] = info[-1].strip() or None
                batch.append(info)
                counts['total'] += 1
                if len(batch) >= BATCH_SIZE:
                    insert(cur, batch)
                    conn.commit()
                    batch = []
                    counts['batches'] += 1
            if batch:
                # Trailing partial batch; as in the original script it is not
                # added to counts['batches'].
                insert(cur, batch)
                conn.commit()
        finally:
            cur.close()
    finally:
        # Closing releases the connection; any uncommitted work is discarded.
        conn.close()
    print("Done: {}".format(counts))


if __name__ == '__main__':
    stdin_to_pg()