re-write parse_cdx_line for sandcrawler lib

author: Bryan Newbold <bnewbold@archive.org> 2019-09-25 17:35:46 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-09-25 17:35:46 -0700
commit: 6e24eec4b6d1861eba37a0a05220b257e829ebbb (patch)
tree: eb3cc5d5a3dc0d35a332b45b993a0716d0c933a7
parent: a3383f8794bcd8aa9195de37c63f040086d57f77 (diff)
download: sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.tar.gz
sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.zip
3 files changed, 116 insertions, 2 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 0120287..0691b6e 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,3 +1,3 @@
 
 from .grobid import GrobidClient
-from .misc import gen_file_metadata, b32_hex
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index e13b5e7..f741f93 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,6 +2,7 @@
 import base64
 import magic
 import hashlib
+import datetime
 
 def gen_file_metadata(blob):
     """
@@ -41,3 +42,86 @@ def b32_hex(s):
         raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
     return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
 
+NORMAL_MIME = (  # canonical mimetypes; matched as prefixes by normalize_mime()
+    'application/pdf',
+    'application/postscript',
+    'text/html',
+    'text/xml',
+)
+
+def normalize_mime(raw):
+    """Map a raw mimetype onto a canonical one by prefix; None if unknown or empty."""
+    raw = (raw or '').lower()
+    for norm in NORMAL_MIME:
+        if raw.startswith(norm):
+            return norm
+    # Special cases: aliases that are folded into a canonical type
+    if raw.startswith('application/xml'):
+        return 'text/xml'
+    if raw.startswith('application/x-pdf'):
+        return 'application/pdf'
+    return None
+
+
+def test_normalize_mime():
+    """Cover prefix matching, case folding, alias mapping and unknown types."""
+    for unknown in ("asdf", "application/p", "application/x-html"):
+        assert normalize_mime(unknown) is None
+    pdf_like = ("application/pdf", "application/pdf+journal", "Application/PDF",
+                "application/x-pdf")
+    for raw in pdf_like:
+        assert normalize_mime(raw) == "application/pdf"
+    assert normalize_mime("application/xml+stuff") == "text/xml"
+
+
+def parse_cdx_line(raw_cdx, normalize=True):
+    """Parse one whitespace-separated CDX line into a dict; None if malformed."""
+    cdx = raw_cdx.split()
+    if len(cdx) < 11:
+        return None
+
+    surt = cdx[0]
+    dt = cdx[1]
+    url = cdx[2]
+    mime = cdx[3]  # raw column; only normalized when asked to
+    if normalize:  # normalize=False keeps the mimetype exactly as it appears
+        mime = normalize_mime(mime)
+    http_status = cdx[4]
+    sha1b32 = cdx[5]
+    c_size = cdx[8]
+    offset = cdx[9]
+    warc = cdx[10]
+    # Reject malformed hash/numeric columns up front so the int() calls below are safe
+    if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit()
+            and http_status.isdigit() and len(sha1b32) == 32 and dt.isdigit()):
+        return None
+    # A bare '-' in any of these required columns means the row is unusable
+    if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc):
+        return None
+    # Unrecognized (normalize_mime returned None) or absent mimetype: generic default
+    if mime is None or mime == '-':
+        mime = "application/octet-stream"
+
+    sha1hex = b32_hex(sha1b32)
+    http_status = int(http_status)
+    c_size = int(c_size)
+    offset = int(offset)
+
+    return dict(
+        surt=surt,
+        url=url,
+        datetime=dt,
+        mimetype=mime,
+        http_status=http_status,
+        sha1b32=sha1b32,
+        sha1hex=sha1hex,
+        c_size=c_size,
+        offset=offset,
+        warc=warc,
+    )
+
+def parse_cdx_datetime(dt_str):  # "%Y%m%d%H%M%S" string -> naive datetime, else None
+    try:  # `datetime` is the module (`import datetime`), so strptime lives on the class
+        return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
+    except (TypeError, ValueError):  # not a string / does not match the format
+        return None
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index a7879c8..02deec9 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
 
 import pytest
 
-from sandcrawler import gen_file_metadata, b32_hex
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
 
 def test_gen_file_metadata():
     
@@ -39,3 +39,33 @@ def test_b32_hex():
     # invalid
     with pytest.raises(ValueError):
         assert b32_hex('blah') == 'blah'
+
+def test_parse_cdx_line():
+    """A well-formed line parses fully; a trailing newline or extra column is ignored."""
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+
+    expected = dict(
+        surt="edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+        url="https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+        datetime="20170828233154",
+        mimetype="application/pdf",
+        http_status=200,
+        sha1b32="WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+        sha1hex="b2f65203da9929c2f758e8dd587b5524f904dbe6",
+        c_size=210251,
+        offset=931661233,
+        warc="SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+    )
+
+    for suffix in ("", "\n", " extra_field"):
+        assert parse_cdx_line(raw + suffix) == expected
+
+def test_invalid_cdx():
+    """Rows with a missing or malformed required column must parse to None."""
+    print("missing warc")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+    assert parse_cdx_line(raw) is None
+    # NOTE: the 13-digit datetime still passes isdigit(); the offset "931661233i" is what rejects this row
+    print("bad datetime")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    assert parse_cdx_line(raw) is None
author	Bryan Newbold <bnewbold@archive.org>	2019-09-25 17:35:46 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2019-09-25 17:35:46 -0700
commit	6e24eec4b6d1861eba37a0a05220b257e829ebbb (patch)
tree	eb3cc5d5a3dc0d35a332b45b993a0716d0c933a7
parent	a3383f8794bcd8aa9195de37c63f040086d57f77 (diff)
download	sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.tar.gz sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.zip