aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-09-25 17:35:46 -0700
committerBryan Newbold <bnewbold@archive.org>2019-09-25 17:35:46 -0700
commit6e24eec4b6d1861eba37a0a05220b257e829ebbb (patch)
treeeb3cc5d5a3dc0d35a332b45b993a0716d0c933a7 /python/sandcrawler/misc.py
parenta3383f8794bcd8aa9195de37c63f040086d57f77 (diff)
downloadsandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.tar.gz
sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.zip
re-write parse_cdx_line for sandcrawler lib
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r--python/sandcrawler/misc.py84
1 files changed, 84 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index e13b5e7..f741f93 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,6 +2,7 @@
import base64
import magic
import hashlib
+import datetime
def gen_file_metadata(blob):
"""
@@ -41,3 +42,86 @@ def b32_hex(s):
raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
# Canonical mimetypes that normalize_mime() collapses raw values onto.
NORMAL_MIME = (
    'application/pdf',
    'application/postscript',
    'text/html',
    'text/xml',
)


def normalize_mime(raw):
    """
    Map a raw mimetype string onto one of the NORMAL_MIME values.

    Matching is case-insensitive and prefix-based, so trailing parameters
    or suffixes (eg, "application/pdf+journal") still match. Two aliases
    are also recognized: "application/xml" becomes "text/xml", and
    "application/x-pdf" becomes "application/pdf".

    Returns the canonical mimetype string, or None if `raw` is None, empty,
    or does not match any known type.
    """
    # Missing values have nothing to normalize; bail out instead of
    # raising AttributeError on None.lower()
    if not raw:
        return None

    raw = raw.lower()
    for norm in NORMAL_MIME:
        if raw.startswith(norm):
            return norm

    # Special cases: aliases that share no prefix with their canonical form
    if raw.startswith('application/xml'):
        return 'text/xml'
    if raw.startswith('application/x-pdf'):
        return 'application/pdf'
    return None
+
+
def test_normalize_mime():
    """Check normalize_mime() against known raw/expected mimetype pairs."""
    # (raw input, expected result); None means "not a recognized mimetype"
    cases = (
        ("asdf", None),
        ("application/pdf", "application/pdf"),
        ("application/pdf+journal", "application/pdf"),
        ("Application/PDF", "application/pdf"),
        ("application/p", None),
        ("application/xml+stuff", "text/xml"),
        ("application/x-pdf", "application/pdf"),
        ("application/x-html", None),
    )
    for raw, expected in cases:
        result = normalize_mime(raw)
        if expected is None:
            assert result is None
        else:
            assert result == expected
+
+
def parse_cdx_line(raw_cdx, normalize=True):
    """
    Parse one whitespace-separated CDX line into a dict.

    Fields are read by position: 0 is the SURT, 1 the datetime string,
    2 the URL, 3 the mimetype, 4 the HTTP status, 5 the base32 SHA-1,
    8 `c_size`, 9 the offset, and 10 the WARC name. Positions 6 and 7 are
    ignored.

    If `normalize` is true, the mimetype is passed through normalize_mime();
    otherwise the raw value is kept. In both cases a missing ('-') or
    unrecognized mimetype is reported as "application/octet-stream".

    Returns None if the line has fewer than 11 fields, if any required
    field is '-', or if the numeric fields / SHA-1 do not have the expected
    shape. Otherwise returns a dict with keys: surt, url, datetime,
    mimetype, http_status, sha1b32, sha1hex, c_size, offset, warc
    (http_status, c_size, and offset are ints).

    NOTE(review): b32_hex() appears to raise ValueError for some malformed
    hashes; that error is not caught here.
    """

    cdx = raw_cdx.split()
    if len(cdx) < 11:
        return None

    surt = cdx[0]
    dt = cdx[1]
    url = cdx[2]
    mime = cdx[3]
    http_status = cdx[4]
    sha1b32 = cdx[5]
    c_size = cdx[8]
    offset = cdx[9]
    warc = cdx[10]

    # http_status is validated along with the other numeric fields so that
    # a garbage status yields None rather than a ValueError from int() below
    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
            and http_status.isdigit() and len(sha1b32) == 32
            and dt.isdigit()):
        return None

    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
        return None

    # Normalize exactly once, and only when requested: normalize_mime()
    # returns None for unknown types, which must not be fed back into it
    if normalize:
        mime = normalize_mime(mime)
    if mime is None or mime == '-':
        mime = "application/octet-stream"

    sha1hex = b32_hex(sha1b32)
    http_status = int(http_status)
    c_size = int(c_size)
    offset = int(offset)

    return dict(
        surt=surt,
        url=url,
        datetime=dt,
        mimetype=mime,
        http_status=http_status,
        sha1b32=sha1b32,
        sha1hex=sha1hex,
        c_size=c_size,
        offset=offset,
        warc=warc,
    )
+
def parse_cdx_datetime(dt_str):
    """
    Parse a 14-digit CDX timestamp ("YYYYMMDDHHMMSS") into a datetime.

    Returns a naive datetime.datetime object, or None if `dt_str` is not a
    string in that format.
    """
    try:
        # `datetime` is imported as a module in this file, so strptime has
        # to be reached through the datetime.datetime class
        return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
    except (ValueError, TypeError):
        # ValueError: wrong format; TypeError: not a string (eg, None)
        return None