"""Helpers for normalizing MIME types and parsing CDX / "ungrobided" lines.

Provenance: this is the content of ``python/common.py`` as recorded on the
removed side of commit 84169ce643075f1e49b18744d5609c7f1c48e7f7
("remove unused common.py", Bryan Newbold, Wed, 17 Jun 2020 11:26:11 -0700),
which deleted the file (99 lines, previous blob e596b35).
"""

import json
import sys
from datetime import datetime

# Canonical MIME types; any raw value starting with one of these maps to it.
NORMAL_MIME = (
    'application/pdf',
    'application/postscript',
    'text/html',
    'text/xml',
)

# Non-canonical prefixes and the canonical type each one is folded into.
_MIME_ALIASES = (
    ('application/xml', 'text/xml'),
    ('application/x-pdf', 'application/pdf'),
)

_ASCII_DIGITS = frozenset("0123456789")


def _is_ascii_digits(value):
    """Return True if *value* is a non-empty string of ASCII digits 0-9.

    ``str.isdigit()`` is deliberately not used: it accepts characters such
    as superscript digits that ``int()`` rejects with ``ValueError``.
    """
    return bool(value) and _ASCII_DIGITS.issuperset(value)


def normalize_mime(raw):
    """Map a raw MIME string to one of the NORMAL_MIME values.

    Matching is case-insensitive and prefix-based, so suffixes such as
    ``+journal`` are ignored.

    Returns the canonical MIME string, or None if *raw* is not recognized.
    """
    raw = raw.lower()
    for norm in NORMAL_MIME:
        if raw.startswith(norm):
            return norm

    # Special cases: known aliases of canonical types.
    for prefix, norm in _MIME_ALIASES:
        if raw.startswith(prefix):
            return norm
    return None


def test_normalize_mime():
    assert normalize_mime("asdf") is None
    assert normalize_mime("application/pdf") == "application/pdf"
    assert normalize_mime("application/pdf+journal") == "application/pdf"
    assert normalize_mime("Application/PDF") == "application/pdf"
    assert normalize_mime("application/p") is None
    assert normalize_mime("application/xml+stuff") == "text/xml"
    assert normalize_mime("application/x-pdf") == "application/pdf"
    assert normalize_mime("application/x-html") is None


def parse_cdx_line(raw_cdx):
    """Parse one whitespace-separated CDX line into a row dict.

    The line must have at least 11 fields. Fields 0-5 are read as surt,
    dt, url, mime, http_status and key; fields 8-10 as c_size, offset
    and warc.

    A line is accepted only when the key is 32 alphanumeric characters,
    c_size/offset/dt are ASCII digits, http_status is "200", the MIME type
    is recognized by normalize_mime(), no used field is the placeholder
    '-', and dt parses as ``%Y%m%d%H%M%S``.

    Returns a dict with keys 'key' (prefixed with "sha1:"), 'file:mime',
    'file:cdx' and 'f:c', or None if the line is rejected.
    """
    cdx = raw_cdx.split()
    if len(cdx) < 11:
        return None

    surt = cdx[0]
    dt = cdx[1]
    url = cdx[2]
    mime = normalize_mime(cdx[3])
    http_status = cdx[4]
    key = cdx[5]
    c_size = cdx[8]
    offset = cdx[9]
    warc = cdx[10]

    if not (key.isalnum() and len(key) == 32
            and _is_ascii_digits(c_size) and _is_ascii_digits(offset)
            and _is_ascii_digits(dt)
            and http_status == "200"
            and mime is not None):
        return None

    # The checks above already rule out '-' for dt, mime, http_status, key,
    # c_size and offset; only these three can still be the placeholder.
    if '-' in (surt, url, warc):
        return None

    try:
        dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
    except ValueError:
        # All-digit but not a valid timestamp (wrong length, month 13, ...).
        return None

    key = "sha1:{}".format(key)
    offset_int = int(offset)

    info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
                offset=offset_int, warc=warc)

    warc_file = warc.split('/')[-1]

    # 'i' intentionally not set
    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=offset_int, c=1)
    return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}


def parse_ungrobided_line(raw_line):
    """Parse one tab-separated "ungrobided" line into a row dict.

    The stripped line must have exactly four tab-separated columns:
    key, JSON for 'f:c', raw MIME type, JSON for 'file:cdx'.

    The key must be 37 characters long with an alphanumeric tail after the
    first five characters, and the MIME type must be recognized by
    normalize_mime().

    Returns a dict with keys 'key', 'file:mime', 'file:cdx' and 'f:c', or
    None if the line is rejected.
    """
    line = raw_line.strip().split("\t")
    if len(line) != 4:
        return None

    key = line[0]
    mime = normalize_mime(line[2])
    try:
        f_c = json.loads(line[1])
        cdx = json.loads(line[3])
    except json.JSONDecodeError:
        return None

    if not (key[5:].isalnum() and len(key) == 37 and mime is not None):
        # Diagnostic goes to stderr rather than stdout, so a caller that
        # writes records to stdout does not get them mixed with debug text.
        print("FAIL: bad key or mime: key={!r} mime={!r}".format(key, mime),
              file=sys.stderr)
        return None

    if '-' in (key, mime, f_c, cdx):
        return None

    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}