From a6d8ea8068109bd0d26d11e47d04249e81b485b2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 25 Apr 2019 19:54:51 -0700 Subject: add another hack-y filter option --- arabesque.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arabesque.py b/arabesque.py index 501e0bd..e67bc0b 100755 --- a/arabesque.py +++ b/arabesque.py @@ -632,7 +632,7 @@ def postprocess(sha1_status_file, output_db): print(counts) return counts -def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None): +def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None, only_direct_breadcrumbs=False): read_db.row_factory = sqlite3.Row if only_identifier_hits: @@ -645,13 +645,24 @@ def dump_json(read_db, only_identifier_hits=False, max_per_identifier=None): last_ident = None ident_count = 0 for row in cur: + if only_direct_breadcrumbs: + # very conservative: must be a direct hit, or an embed. no link + # hops allowed. the redirect ignoring logic is easier to express in + # python than SQL + # TODO: do the work and express in SQL so we can have sane reporting + bc = row[3] + bc = bc.replace('R', '') + if len(bc) > 1: + continue + if bc not in ('-', 'E', ''): + continue if last_ident and row[1] == last_ident: ident_count += 1 if max_per_identifier and ident_count > max_per_identifier: sys.stderr.write("SKIPPING identifier maxed out: {}\n\r".format(last_ident)) continue else: - ident_count = 0 + ident_count = 1 last_ident = row[1] print(json.dumps(dict(row))) @@ -718,6 +729,9 @@ def main(): sub_dump_json.add_argument("--only-identifier-hits", action="store_true", help="only dump rows where hit=true and identifier is non-null") + sub_dump_json.add_argument("--only-direct-breadcrumbs", + action="store_true", + help="only dump rows where breadcrumbs are clear (direct, redirect, etc)") sub_dump_json.add_argument("--max-per-identifier", default=False, type=int, help="don't dump more than this many rows per unique identifier") @@ -767,6 +781,7 @@ def main(): elif args.func is dump_json: dump_json(sqlite3.connect(args.db_file, isolation_level='EXCLUSIVE'), only_identifier_hits=args.only_identifier_hits, + only_direct_breadcrumbs=args.only_direct_breadcrumbs, max_per_identifier=args.max_per_identifier) else: raise NotImplementedError -- cgit v1.2.3