aboutsummaryrefslogtreecommitdiffstats
path: root/python/common.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-24 18:40:39 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-24 18:40:39 -0700
commit c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e (patch)
tree 06b267ec0883b6f4e5bdb41413e953192ff07b12 /python/common.py
parent 1f989c851247115784d5bc877341f1e8d7ff5f98 (diff)
download sandcrawler-c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e.tar.gz
sandcrawler-c2dbb9b5299ba7cfc4e2328ce3a4ef6c2882dc9e.zip
python extraction_ungrobided job
Diffstat (limited to 'python/common.py')
-rw-r--r-- python/common.py 26
1 files changed, 26 insertions, 0 deletions
diff --git a/python/common.py b/python/common.py
index 6710044..e596b35 100644
--- a/python/common.py
+++ b/python/common.py
@@ -1,4 +1,5 @@
+import json
from datetime import datetime
NORMAL_MIME = (
@@ -71,3 +72,28 @@ def parse_cdx_line(raw_cdx):
# 'i' intentionally not set
heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
+
def parse_ungrobided_line(raw_line):
    """Parse one tab-separated "ungrobided" row into a record dict.

    After stripping surrounding whitespace, the row must contain exactly four
    tab-separated fields, in this order:

        key, JSON-encoded ``f:c`` value, raw mimetype, JSON-encoded CDX value

    Args:
        raw_line: a single text line (str) of the input.

    Returns:
        A dict with the keys ``'key'``, ``'file:mime'``, ``'file:cdx'`` and
        ``'f:c'``, or ``None`` when the row is rejected because:

        * it does not have exactly four fields;
        * either JSON field fails to decode;
        * the key is not 37 characters long, or its tail after the first
          five characters is not alphanumeric;
        * ``normalize_mime()`` returns ``None`` for the mimetype;
        * any of the four parsed values equals the ``'-'`` placeholder.
    """
    # Function-local import: only the diagnostic path below needs it.
    import sys

    fields = raw_line.strip().split("\t")
    if len(fields) != 4:
        return None

    key, raw_f_c, raw_mime, raw_cdx = fields
    mime = normalize_mime(raw_mime)
    try:
        f_c = json.loads(raw_f_c)
        cdx = json.loads(raw_cdx)
    except json.JSONDecodeError:
        return None

    # NOTE(review): 37 chars with a 5-char prefix presumably means
    # "sha1:" + a 32-char base32 digest; the prefix text itself is not
    # checked here -- confirm against the producer of these rows.
    key_ok = len(key) == 37 and key[5:].isalnum()
    if not (key_ok and mime is not None):
        # Diagnostics go to stderr so they never mix with records a caller
        # may be writing to stdout.
        print(mime, file=sys.stderr)
        print(key, file=sys.stderr)
        print("FAIL", file=sys.stderr)
        return None

    # '-' is treated as a "no value" placeholder; reject the row if any
    # parsed value is exactly that.
    if '-' in (key, mime, f_c, cdx):
        return None

    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}