From c508cde132f0ec8156c36c6ffd6592b089b8207a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Apr 2019 17:32:14 -0700 Subject: create SHA-1 index at end of forward (not in post-process) --- arabesque.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arabesque.py b/arabesque.py index 9b12b68..8e76e4c 100755 --- a/arabesque.py +++ b/arabesque.py @@ -570,6 +570,7 @@ def forward(seed_id_file, map_db, output_db): c.executescript(""" CREATE INDEX IF NOT EXISTS result_initial_url on crawl_result (initial_url); CREATE INDEX IF NOT EXISTS result_identifier on crawl_result (identifier); + CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1); """) c.close() print("Forward map complete.") @@ -587,16 +588,11 @@ def everything(log_file, seed_id_file, map_db, output_db, hit_mimetypes=FULLTEXT def postprocess(sha1_status_file, output_db): print("Updating database with post-processing status") - print("""If script fails, you may need to manually: + print("""If script fails (on old databases) you may need to manually: ALTER TABLE crawl_result ADD COLUMN postproc_status text;""") counts = collections.Counter({'lines-parsed': 0}) c = output_db.cursor() - print("Building SHA-1 index (this can be slow)...") - c.executescript(""" - CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1); - """) - i = 0 for raw_line in sha1_status_file: line = raw_line.strip().split('\t') -- cgit v1.2.3