From 71ed3d20c6898df32a31c9b1ecc843e56c976e9d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 23 Apr 2019 17:32:40 -0700
Subject: add ident row cap in dump_json

---
 arabesque.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/arabesque.py b/arabesque.py
index 8e76e4c..8dbc0ca 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -631,15 +631,27 @@ def postprocess(sha1_status_file, output_db):
     print(counts)
     return counts
 
-def dump_json(read_db, only_identifier_hits=False):
+def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None):
+    """Print crawl_result rows to stdout as JSON lines, ordered by identifier.
+    If max_per_identifier is set, print at most that many rows per non-empty identifier."""
 
     read_db.row_factory = sqlite3.Row
     if only_identifier_hits:
-        cur = read_db.execute("SELECT * FROM crawl_result WHERE hit = 1 AND identifier IS NOT NULL;")
+        sys.stderr.write("Only dumping hits with identifiers\n")
+        cur = read_db.execute("SELECT * FROM crawl_result WHERE hit = 1 AND identifier IS NOT NULL ORDER BY identifier;")
     else:
-        cur = read_db.execute("SELECT * FROM crawl_result;")
+        sys.stderr.write("Dumping all rows\n")
+        cur = read_db.execute("SELECT * FROM crawl_result ORDER BY identifier;")
 
+    last_ident, ident_count = None, 0
     for row in cur:
+        ident = row["identifier"]
+        # Rows arrive sorted, so equal identifiers are adjacent; count each run from 1.
+        ident_count = ident_count + 1 if ident == last_ident else 1
+        last_ident = ident
+        if max_per_identifier and ident and ident_count > max_per_identifier:
+            sys.stderr.write("SKIPPING identifier maxed out: {}\n".format(ident))
+            continue
         print(json.dumps(dict(row)))
 
 def main():
@@ -705,6 +717,9 @@ def main():
     sub_dump_json.add_argument("--only-identifier-hits",
         action="store_true",
         help="only dump rows where hit=true and identifier is non-null")
+    sub_dump_json.add_argument("--max-per-identifier",
+        default=False, type=int,
+        help="don't dump more than this many rows per unique identifier")
 
     parser.add_argument("--html-hit",
         action="store_true",
-- 
cgit v1.2.3