author    | Bryan Newbold <bnewbold@archive.org> | 2019-04-12 17:08:01 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2019-04-12 17:08:01 -0700
commit    | 50a076b46842b32288c89d199d28e54032341f49 (patch)
tree      | e9efb22f71a58ffde7aa537f683ba202a2f4ff1d
parent    | 4f40ea7d0cb19ceac15c28b61c479a66895cea2d (diff)
download  | arabesque-50a076b46842b32288c89d199d28e54032341f49.tar.gz
          | arabesque-50a076b46842b32288c89d199d28e54032341f49.zip
add support for a post-processing column
-rwxr-xr-x | arabesque.py                     | 71
-rw-r--r-- | examples/grobid_status_codes.tsv |  6
-rw-r--r-- | examples/output.sqlite3          | bin | 262144 -> 278528 bytes

3 files changed, 72 insertions, 5 deletions
diff --git a/arabesque.py b/arabesque.py
index 07c05b5..b5b2224 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -265,7 +265,8 @@ def create_out_table(db):
         final_sha1 text,
         final_mimetype text,
         final_was_dedupe bool,
-        hit bool);
+        hit bool,
+        postproc_status text);
     """)
 
 def referrer(log_file, map_db):
@@ -446,8 +447,8 @@ def backward(log_file, map_db, output_db, hit_mimetypes=FULLTEXT_MIMETYPES):
             final_timestamp = None
             if len(line.timestamp) >= 12 and line.timestamp[4] != '-':
                 final_timestamp = line.timestamp[:12]
-            c.execute("INSERT INTO crawl_result VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
-                (row.url, None, initial_domain, final_row.breadcrumbs, final_row.url, final_domain, final_timestamp, final_row.status_code, line.sha1, final_row.mimetype, final_row.is_dedupe, True))
+            c.execute("INSERT INTO crawl_result VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
+                (row.url, None, initial_domain, final_row.breadcrumbs, final_row.url, final_domain, final_timestamp, final_row.status_code, line.sha1, final_row.mimetype, final_row.is_dedupe, True, None))
             #print(final_row.breadcrumbs)
             i = i+1
             counts['inserted'] += 1
@@ -548,8 +549,8 @@ def forward(seed_id_file, map_db, output_db):
             final_domain = urllib3.util.parse_url(final_row.url).host
             # TODO: would pass SHA1 here if we had it? but not stored in referrer table
             # XXX: None => timestamp
-            c.execute("INSERT INTO crawl_result VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
-                (seed_url, identifier, initial_domain, final_row.breadcrumbs, final_row.url, final_domain, None, final_row.status_code, None, final_row.mimetype, final_row.is_dedupe, False))
+            c.execute("INSERT INTO crawl_result VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
+                (seed_url, identifier, initial_domain, final_row.breadcrumbs, final_row.url, final_domain, None, final_row.status_code, None, final_row.mimetype, final_row.is_dedupe, False, None))
             #print(final_row.breadcrumbs)
             i = i+1
             counts['inserted'] += 1
@@ -578,6 +579,56 @@ def everything(log_file, seed_id_file, map_db, output_db, hit_mimetypes=FULLTEXT
     print(bcounts)
     print(fcounts)
 
+def postprocess(sha1_status_file, output_db):
+    print("Updating database with post-processing status")
+    print("""If script fails, you may need to manually:
+        ALTER TABLE crawl_result ADD COLUMN postproc_status text;""")
+    counts = collections.Counter({'lines-parsed': 0})
+    c = output_db.cursor()
+
+    print("Building SHA-1 index (this can be slow)...")
+    c.executescript("""
+        CREATE INDEX IF NOT EXISTS result_final_sha1 on crawl_result (final_sha1);
+    """)
+
+    i = 0
+    for raw_line in sha1_status_file:
+        line = raw_line.strip().split('\t')
+        if not line or len(line) == 1:
+            counts['skip-raw-line'] += 1
+            continue
+        if len(line) == 2:
+            sha1, status = line[0:2]
+        else:
+            print("WEIRD: {}".format(raw_line))
+            assert len(line) <= 2
+
+        # parse/validate SHA-1
+        if sha1.startswith("sha1:"):
+            sha1 = sha1[5:]
+        if not len(sha1) == 32:
+            counts['skip-bad-sha1'] += 1
+            continue
+        status = status.strip()
+
+        res = c.execute('UPDATE crawl_result SET postproc_status=? WHERE final_sha1=?', [status, sha1])
+        if res.rowcount == 0:
+            counts['sha1-not-found'] += 1
+        else:
+            counts['rows-updated'] += res.rowcount
+
+        i = i+1
+        if i % 2000 == 0:
+            print("... postprocess {}".format(i))
+            output_db.commit()
+
+    output_db.commit()
+
+    c.close()
+    print("Post-processing complete.")
+    print(counts)
+    return counts
+
 def main():
     parser = argparse.ArgumentParser()
     subparsers = parser.add_subparsers()
@@ -627,6 +678,13 @@
     sub_everything.add_argument("--map_db_file",
         default=":memory:", type=str)
 
+    sub_postprocess = subparsers.add_parser('postprocess')
+    sub_postprocess.set_defaults(func=postprocess)
+    sub_postprocess.add_argument("sha1_status_file",
+        default=sys.stdin, type=argparse.FileType('rt'))
+    sub_postprocess.add_argument("db_file",
+        type=str)
+
     parser.add_argument("--html-hit",
         action="store_true",
         help="run in mode that considers only terminal HTML success")
@@ -666,6 +724,9 @@
            sqlite3.connect(args.map_db_file),
            sqlite3.connect(args.output_db_file, isolation_level='EXCLUSIVE'),
            hit_mimetypes=hit_mimetypes)
+    elif args.func is postprocess:
+        postprocess(args.sha1_status_file,
+            sqlite3.connect(args.db_file, isolation_level='EXCLUSIVE'))
     else:
         raise NotImplementedError
 
diff --git a/examples/grobid_status_codes.tsv b/examples/grobid_status_codes.tsv
new file mode 100644
index 0000000..1f7a45e
--- /dev/null
+++ b/examples/grobid_status_codes.tsv
@@ -0,0 +1,6 @@
+sha1:QZJO4VAXQPVX3XA5DXZEKURTKVEPH5FE	200
+sha1:JQQMQSUJEATRDLI3UWGRHTVREJVRTUJ3	200
+sha1:KKSZMZOTULQNXFHQKO4VGMXWI36NIZKH	500
+sha1:DC3LIDN3RW2GCANRBWQJRSWWQR4XXWQP	200
+sha1:RUOELDT3YYFZTPGH5CUAF7KLJOQRD6NF	400
+sha1:DKILDQA3LYZETFTZMB33KHMQUW6VWW5U	200
diff --git a/examples/output.sqlite3 b/examples/output.sqlite3
index b86e281..4a202a8 100644
--- a/examples/output.sqlite3
+++ b/examples/output.sqlite3
Binary files differ