about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x arabesque.py 19
1 file changed, 17 insertions, 2 deletions
diff --git a/arabesque.py b/arabesque.py
index 501e0bd..e67bc0b 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -632,7 +632,7 @@ def postprocess(sha1_status_file, output_db):
print(counts)
return counts
-def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None):
+def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None, only_direct_breadcrumbs=False):
read_db.row_factory = sqlite3.Row
if only_identifier_hits:
@@ -645,13 +645,24 @@ def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None):
last_ident = None
ident_count = 0
for row in cur:
+ if only_direct_breadcrumbs:
+ # very conservative: must be a direct hit, or an embed. no link
+ # hops allowed. the redirect ignoring logic is easier to express in
+ # python than SQL
+ # TODO: do the work and express in SQL so we can have sane reporting
+ bc = row[3]
+ bc = bc.replace('R', '')
+ if len(bc) > 1:
+ continue
+ if bc not in ('-', 'E', ''):
+ continue
if last_ident and row[1] == last_ident:
ident_count += 1
if max_per_identifier and ident_count > max_per_identifier:
sys.stderr.write("SKIPPING identifier maxed out: {}\n\r".format(last_ident))
continue
else:
- ident_count = 0
+ ident_count = 1
last_ident = row[1]
print(json.dumps(dict(row)))
@@ -718,6 +729,9 @@ def main():
sub_dump_json.add_argument("--only-identifier-hits",
action="store_true",
help="only dump rows where hit=true and identifier is non-null")
+ sub_dump_json.add_argument("--only-direct-breadcrumbs",
+ action="store_true",
+ help="only dump rows where breadcrumbs are clear (direct, redirect, etc)")
sub_dump_json.add_argument("--max-per-identifier",
default=False, type=int,
help="don't dump more than this many rows per unique identifier")
@@ -767,6 +781,7 @@ def main():
elif args.func is dump_json:
dump_json(sqlite3.connect(args.db_file, isolation_level='EXCLUSIVE'),
only_identifier_hits=args.only_identifier_hits,
+ only_direct_breadcrumbs=args.only_direct_breadcrumbs,
max_per_identifier=args.max_per_identifier)
else:
raise NotImplementedError