about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-04-23 17:32:14 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-04-23 17:32:14 -0700
commit: c508cde132f0ec8156c36c6ffd6592b089b8207a (patch)
tree: 0bc0d11dc9bfe6793ec087ad58b03e1f63ae0e94
parent: 39150d0fec3a444d9fa2786aa19e7c098c8247df (diff)
download: arabesque-c508cde132f0ec8156c36c6ffd6592b089b8207a.tar.gz
          arabesque-c508cde132f0ec8156c36c6ffd6592b089b8207a.zip
create SHA-1 index at end of forward (not in post-process)
-rwxr-xr-x arabesque.py 8
1 files changed, 2 insertions, 6 deletions
diff --git a/arabesque.py b/arabesque.py
index 9b12b68..8e76e4c 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -570,6 +570,7 @@ def forward(seed_id_file, map_db, output_db):
c.executescript("""
CREATE INDEX IF NOT EXISTS result_initial_url on crawl_result (initial_url);
CREATE INDEX IF NOT EXISTS result_identifier on crawl_result (identifier);
+ CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1);
""")
c.close()
print("Forward map complete.")
@@ -587,16 +588,11 @@ def everything(log_file, seed_id_file, map_db, output_db, hit_mimetypes=FULLTEXT
def postprocess(sha1_status_file, output_db):
print("Updating database with post-processing status")
- print("""If script fails, you may need to manually:
+ print("""If script fails (on old databases) you may need to manually:
ALTER TABLE crawl_result ADD COLUMN postproc_status text;""")
counts = collections.Counter({'lines-parsed': 0})
c = output_db.cursor()
- print("Building SHA-1 index (this can be slow)...")
- c.executescript("""
- CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1);
- """)
-
i = 0
for raw_line in sha1_status_file:
line = raw_line.strip().split('\t')