diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-04-23 17:32:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-04-23 17:32:14 -0700 |
commit | c508cde132f0ec8156c36c6ffd6592b089b8207a (patch) | |
tree | 0bc0d11dc9bfe6793ec087ad58b03e1f63ae0e94 | |
parent | 39150d0fec3a444d9fa2786aa19e7c098c8247df (diff) | |
download | arabesque-c508cde132f0ec8156c36c6ffd6592b089b8207a.tar.gz arabesque-c508cde132f0ec8156c36c6ffd6592b089b8207a.zip |
create SHA-1 index at end of forward (not in post-process)
-rwxr-xr-x | arabesque.py | 8 |
1 file changed, 2 insertions, 6 deletions
```diff
diff --git a/arabesque.py b/arabesque.py
index 9b12b68..8e76e4c 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -570,6 +570,7 @@ def forward(seed_id_file, map_db, output_db):
     c.executescript("""
         CREATE INDEX IF NOT EXISTS result_initial_url on crawl_result (initial_url);
         CREATE INDEX IF NOT EXISTS result_identifier on crawl_result (identifier);
+        CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1);
     """)
     c.close()
     print("Forward map complete.")
@@ -587,16 +588,11 @@ def everything(log_file, seed_id_file, map_db, output_db, hit_mimetypes=FULLTEXT
 
 def postprocess(sha1_status_file, output_db):
     print("Updating database with post-processing status")
-    print("""If script fails, you may need to manually:
+    print("""If script fails (on old databases) you may need to manually:
         ALTER TABLE crawl_result ADD COLUMN postproc_status text;""")
     counts = collections.Counter({'lines-parsed': 0})
     c = output_db.cursor()
 
-    print("Building SHA-1 index (this can be slow)...")
-    c.executescript("""
-        CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1);
-    """)
-
     i = 0
     for raw_line in sha1_status_file:
         line = raw_line.strip().split('\t')
```