re-write parse_cdx_line for sandcrawler lib

author: Bryan Newbold <bnewbold@archive.org> 2019-09-25 17:35:46 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-09-25 17:35:46 -0700
commit: 6e24eec4b6d1861eba37a0a05220b257e829ebbb (patch)
tree: eb3cc5d5a3dc0d35a332b45b993a0716d0c933a7 /python/sandcrawler/misc.py
parent: a3383f8794bcd8aa9195de37c63f040086d57f77 (diff)
download: sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.tar.gz
sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.zip
1 files changed, 84 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index e13b5e7..f741f93 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,6 +2,7 @@
 import base64
 import magic
 import hashlib
+import datetime
 
 def gen_file_metadata(blob):
     """
@@ -41,3 +42,86 @@ def b32_hex(s):
         raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
     return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
 
+# Canonical mimetype strings. normalize_mime() returns the first entry that
+# the (lower-cased) raw mimetype starts with.
+NORMAL_MIME = (
+    'application/pdf',
+    'application/postscript',
+    'text/html',
+    'text/xml',
+)
+
+def normalize_mime(raw):
+    """
+    Map a raw mimetype string onto one of a small set of canonical values.
+
+    Matching is case-insensitive and prefix-based, so anything trailing the
+    known type (eg, "application/pdf+journal") collapses onto the canonical
+    value. Two aliases are handled explicitly: "application/xml..." becomes
+    "text/xml" and "application/x-pdf..." becomes "application/pdf".
+
+    Returns the canonical mimetype string, or None if `raw` is None/empty or
+    does not match any known type.
+    """
+    # A missing value means "unknown"; return None rather than crashing on
+    # None.lower().
+    if not raw:
+        return None
+
+    raw = raw.lower()
+    for norm in NORMAL_MIME:
+        if raw.startswith(norm):
+            return norm
+
+    # Special cases: aliases that do not share a prefix with their canonical
+    # form.
+    if raw.startswith('application/xml'):
+        return 'text/xml'
+    if raw.startswith('application/x-pdf'):
+        return 'application/pdf'
+    return None
+
+
+def test_normalize_mime():
+    # (raw input, expected normalized output) pairs
+    expectations = (
+        ("asdf", None),
+        ("application/pdf", "application/pdf"),
+        ("application/pdf+journal", "application/pdf"),
+        ("Application/PDF", "application/pdf"),
+        ("application/p", None),
+        ("application/xml+stuff", "text/xml"),
+        ("application/x-pdf", "application/pdf"),
+        ("application/x-html", None),
+    )
+    for given, wanted in expectations:
+        assert normalize_mime(given) == wanted
+
+
+def parse_cdx_line(raw_cdx, normalize=True):
+    """
+    Parse one whitespace-separated CDX line into a dict.
+
+    raw_cdx: a single CDX line (str); at least 11 fields are required
+    normalize: if True, the mimetype field is passed through
+        normalize_mime(); if False, the raw mimetype is kept as-is
+
+    Returns None if the line has fewer than 11 fields, if any of the surt,
+    timestamp, url, status, SHA-1, size, offset, or warc fields is the '-'
+    placeholder, or if the timestamp, HTTP status, SHA-1, size, or offset
+    fields are not well-formed.
+
+    Otherwise returns a dict with keys: surt, url, datetime, mimetype,
+    http_status, sha1b32, sha1hex, c_size, offset, warc. The http_status,
+    c_size, and offset values are ints. A mimetype of '-' (or, when
+    `normalize` is set, one that normalize_mime() does not recognize) is
+    reported as "application/octet-stream".
+    """
+
+    cdx = raw_cdx.split()
+    if len(cdx) < 11:
+        return None
+
+    surt = cdx[0]
+    dt = cdx[1]
+    url = cdx[2]
+    # Keep the raw value here, so that normalize=False is honored and
+    # normalize_mime() is never called on its own (possibly None) output.
+    mime = cdx[3]
+    if normalize:
+        mime = normalize_mime(mime)
+    http_status = cdx[4]
+    sha1b32 = cdx[5]
+    c_size = cdx[8]
+    offset = cdx[9]
+    warc = cdx[10]
+
+    # Every field converted with int() below must be all digits; otherwise
+    # a malformed line would raise ValueError instead of returning None.
+    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
+            and len(sha1b32) == 32 and dt.isdigit()
+            and http_status.isdigit()):
+        return None
+
+    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+        return None
+
+    if mime is None or mime == '-':
+        mime = "application/octet-stream"
+
+    sha1hex = b32_hex(sha1b32)
+    http_status = int(http_status)
+    c_size = int(c_size)
+    offset = int(offset)
+
+    return dict(
+        surt=surt,
+        url=url,
+        datetime=dt,
+        mimetype=mime,
+        http_status=http_status,
+        sha1b32=sha1b32,
+        sha1hex=sha1hex,
+        c_size=c_size,
+        offset=offset,
+        warc=warc,
+    )
+
+def parse_cdx_datetime(dt_str):
+    """
+    Parse a 14-digit "YYYYMMDDHHMMSS" timestamp string.
+
+    Returns a naive datetime.datetime, or None if `dt_str` is not a string
+    or does not match that format.
+    """
+    try:
+        # The module (not the class) is imported at file level, so strptime
+        # has to be reached via datetime.datetime.
+        return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+    except (ValueError, TypeError):
+        # ValueError: wrong format; TypeError: non-string input (eg, None)
+        return None
author	Bryan Newbold <bnewbold@archive.org>	2019-09-25 17:35:46 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2019-09-25 17:35:46 -0700
commit	6e24eec4b6d1861eba37a0a05220b257e829ebbb (patch)
tree	eb3cc5d5a3dc0d35a332b45b993a0716d0c933a7 /python/sandcrawler/misc.py
parent	a3383f8794bcd8aa9195de37c63f040086d57f77 (diff)
download	sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.tar.gz sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.zip