diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-27 14:32:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-27 14:32:19 -0700 |
commit | a7156b06340460e0e70a19891e161b8b8f4f2078 (patch) | |
tree | 029a04f6558b99f060e845ab1ef00baa6cd39600 /python/common.py | |
parent | 4c374c647d8fecce827cabcb579e5aae20f198db (diff) | |
parent | d2b4da4c55a24468a0cbfdc9f567449d4e913331 (diff) | |
download | sandcrawler-a7156b06340460e0e70a19891e161b8b8f4f2078.tar.gz sandcrawler-a7156b06340460e0e70a19891e161b8b8f4f2078.zip |
Merge branch 'bnewbold-ungrobided'
Diffstat (limited to 'python/common.py')
-rw-r--r-- | python/common.py | 26 |
1 files changed, 26 insertions, 0 deletions
import json


def parse_ungrobided_line(raw_line):
    """Parse one tab-separated "ungrobided" row into a record dict.

    The row must contain exactly four tab-separated columns, in this order:

        0. key   -- row key
        1. f:c   -- JSON document
        2. mime  -- raw mimetype, passed through normalize_mime()
        3. cdx   -- JSON document

    Returns a dict with the keys 'key', 'file:mime', 'file:cdx' and 'f:c',
    or None when the row is malformed. A row is malformed when:

      * it does not have exactly four columns,
      * the key is not exactly 37 characters long, or anything after its
        first five characters is not alphanumeric,
      * normalize_mime() returns None for the mimetype,
      * either JSON column fails to decode,
      * any parsed value is the placeholder string '-'.

    Rejection is silent: nothing is written to stdout, so the function is
    safe to call from code whose stdout is a data stream.
    """
    columns = raw_line.strip().split("\t")
    if len(columns) != 4:
        return None
    key, raw_f_c, raw_mime, raw_cdx = columns

    # Cheapest check first, so junk rows are dropped before any MIME
    # normalization or JSON decoding work is done.
    # NOTE(review): the 37-char / 5-char-prefix shape presumably matches a
    # "sha1:" prefix plus a 32-character digest -- confirm against producers.
    if len(key) != 37 or not key[5:].isalnum():
        return None

    mime = normalize_mime(raw_mime)
    if mime is None:
        return None

    try:
        f_c = json.loads(raw_f_c)
        cdx = json.loads(raw_cdx)
    except json.JSONDecodeError:
        return None

    # '-' is treated as a "missing value" placeholder. The key cannot be
    # '-' at this point because its length was already validated above.
    if '-' in (mime, f_c, cdx):
        return None

    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}