diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-24 18:40:39 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-24 18:40:39 -0700 |
commit | c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e (patch) | |
tree | 06b267ec0883b6f4e5bdb41413e953192ff07b12 /python/common.py | |
parent | 1f989c851247115784d5bc877341f1e8d7ff5f98 (diff) | |
download | sandcrawler-c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e.tar.gz sandcrawler-c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e.zip |
python extraction_ungrobided job
Diffstat (limited to 'python/common.py')
-rw-r--r-- | python/common.py | 26 |
1 files changed, 26 insertions, 0 deletions
def parse_ungrobided_line(raw_line):
    """Parse one tab-separated "ungrobided" row into a column dict.

    After stripping surrounding whitespace the line must split into exactly
    four tab-separated fields, in this order:

    1. the row key,
    2. a JSON document, returned under ``'f:c'``,
    3. a raw mimetype, passed through ``normalize_mime`` and returned under
       ``'file:mime'``,
    4. a JSON document, returned under ``'file:cdx'``.

    Returns:
        A dict with the keys ``'key'``, ``'file:mime'``, ``'file:cdx'`` and
        ``'f:c'``, or ``None`` when the line is rejected: wrong field count,
        undecodable JSON, a key that is not 37 characters long with an
        alphanumeric tail after its first five characters, a mimetype that
        normalizes to ``None``, or any value equal to the ``'-'``
        placeholder.
    """
    # Function-scope import: only needed for the rejection diagnostic below.
    import sys

    fields = raw_line.strip().split("\t")
    if len(fields) != 4:
        return None

    key, raw_f_c, raw_mime, raw_cdx = fields
    mime = normalize_mime(raw_mime)
    try:
        f_c = json.loads(raw_f_c)
        cdx = json.loads(raw_cdx)
    except json.JSONDecodeError:
        return None

    # NOTE(review): the 5-character prefix + 32 alphanumerics shape looks like
    # a "sha1:"-prefixed base32 digest -- confirm against the table schema.
    key_ok = len(key) == 37 and key[5:].isalnum()
    if not (key_ok and mime is not None):
        # Diagnostics go to stderr so they cannot mix with data that a
        # caller writes to stdout.
        print("FAIL", key, mime, file=sys.stderr)
        return None

    # A bare '-' is treated as a "no value" placeholder; reject the row if
    # any column is exactly that string.
    if '-' in (key, mime, f_c, cdx):
        return None

    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}