about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-04-23 17:32:40 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-04-23 17:32:40 -0700
commit 71ed3d20c6898df32a31c9b1ecc843e56c976e9d (patch)
tree 31a322c3a5d231dca7c0782c54f664b610c8584a
parent c508cde132f0ec8156c36c6ffd6592b089b8207a (diff)
download arabesque-71ed3d20c6898df32a31c9b1ecc843e56c976e9d.tar.gz
arabesque-71ed3d20c6898df32a31c9b1ecc843e56c976e9d.zip
add ident row cap in dump_json
-rwxr-xr-x arabesque.py 21
1 files changed, 18 insertions, 3 deletions
diff --git a/arabesque.py b/arabesque.py
index 8e76e4c..8dbc0ca 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -631,15 +631,27 @@ def postprocess(sha1_status_file, output_db):
print(counts)
return counts
-def dump_json(read_db, only_identifier_hits=False):
+def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None):
read_db.row_factory = sqlite3.Row
if only_identifier_hits:
- cur = read_db.execute("SELECT * FROM crawl_result WHERE hit = 1 AND identifier IS NOT NULL;")
+ sys.stderr.write("Only dumping hits with identifiers\n\r")
+ cur = read_db.execute("SELECT * FROM crawl_result WHERE hit = 1 AND identifier IS NOT NULL ORDER BY identifier;")
else:
- cur = read_db.execute("SELECT * FROM crawl_result;")
+ sys.stderr.write("Dumping all rows\n\r")
+ cur = read_db.execute("SELECT * FROM crawl_result ORDER BY identifier;")
+ last_ident = None
+ ident_count = 0
for row in cur:
+ if last_ident and row[0] == last_ident:
+ ident_count += 1
+ if max_per_identifier and ident_count > max_per_identifier:
+ print("SKIPPING identifier maxed out: {}".format(last_ident))
+ continue
+ else:
+ ident_count = 0
+ last_ident = row[0]
print(json.dumps(dict(row)))
def main():
@@ -705,6 +717,9 @@ def main():
sub_dump_json.add_argument("--only-identifier-hits",
action="store_true",
help="only dump rows where hit=true and identifier is non-null")
+ sub_dump_json.add_argument("--max-per-identifier",
+ default=False, type=int,
+ help="don't dump more than this many rows per unique identifier")
parser.add_argument("--html-hit",
action="store_true",