# Recovered from the patch "python extraction_ungrobided job"
# (commit c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e, python/common.py).
import json
import sys


def parse_ungrobided_line(raw_line):
    """Parse one tab-separated "ungrobided" row into a record dict.

    The row must contain exactly four tab-separated columns, in this order:

    1. key
    2. JSON-encoded value, returned under ``'f:c'``
    3. MIME type, passed through ``normalize_mime()`` (defined elsewhere
       in this module)
    4. JSON-encoded value, returned under ``'file:cdx'``

    Args:
        raw_line: A single input line; surrounding whitespace is stripped
            before splitting.

    Returns:
        A dict with the keys ``'key'``, ``'file:mime'``, ``'file:cdx'`` and
        ``'f:c'``, or ``None`` when the line is rejected: wrong column
        count, undecodable JSON, a key that is not 37 characters long with
        an alphanumeric tail after the first five characters, a MIME type
        that normalizes to ``None``, or any field equal to ``'-'``.
    """
    columns = raw_line.strip().split("\t")
    if len(columns) != 4:
        return None

    key = columns[0]
    mime = normalize_mime(columns[2])
    try:
        f_c = json.loads(columns[1])
        cdx = json.loads(columns[3])
    except json.JSONDecodeError:
        return None

    # The first five characters of the key are skipped by the alphanumeric
    # check (presumably a "sha1:"-style prefix -- TODO confirm with the
    # producer of these lines).
    key_ok = len(key) == 37 and key[5:].isalnum()
    if not (key_ok and mime is not None):
        # Diagnostics go to stderr so they cannot be mistaken for, or
        # interleaved with, records written to stdout.
        print("FAIL", "key=%r" % (key,), "mime=%r" % (mime,), file=sys.stderr)
        return None

    # A bare '-' in any field is treated as a missing-value placeholder.
    if '-' in (key, mime, f_c, cdx):
        return None

    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}