about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-09-25 17:35:46 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-09-25 17:35:46 -0700
commit 6e24eec4b6d1861eba37a0a05220b257e829ebbb (patch)
tree eb3cc5d5a3dc0d35a332b45b993a0716d0c933a7
parent a3383f8794bcd8aa9195de37c63f040086d57f77 (diff)
download sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.tar.gz
download sandcrawler-6e24eec4b6d1861eba37a0a05220b257e829ebbb.zip
re-write parse_cdx_line for sandcrawler lib
-rw-r--r-- python/sandcrawler/__init__.py 2
-rw-r--r-- python/sandcrawler/misc.py 84
-rw-r--r-- python/tests/test_misc.py 32
3 files changed, 116 insertions, 2 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 0120287..0691b6e 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,3 +1,3 @@
from .grobid import GrobidClient
-from .misc import gen_file_metadata, b32_hex
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index e13b5e7..f741f93 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,6 +2,7 @@
import base64
import magic
import hashlib
+import datetime
def gen_file_metadata(blob):
"""
@@ -41,3 +42,86 @@ def b32_hex(s):
raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
# Canonical mimetypes returned by normalize_mime(). Raw values are matched by
# prefix, so anything trailing the canonical string is ignored.
NORMAL_MIME = (
    'application/pdf',
    'application/postscript',
    'text/html',
    'text/xml',
)


def normalize_mime(raw):
    """
    Map a raw mimetype string onto one of the NORMAL_MIME values.

    Matching is case-insensitive and prefix-based: the first NORMAL_MIME entry
    that the lowercased input starts with is returned. Two aliases are also
    handled: 'application/xml...' becomes 'text/xml' and
    'application/x-pdf...' becomes 'application/pdf'.

    Returns None when nothing matches, or when raw is None or empty.
    """
    # Guard against None/empty input so callers do not crash on .lower()
    if not raw:
        return None

    raw = raw.lower()
    for norm in NORMAL_MIME:
        if raw.startswith(norm):
            return norm

    # Special cases
    if raw.startswith('application/xml'):
        return 'text/xml'
    if raw.startswith('application/x-pdf'):
        return 'application/pdf'
    return None
+
+
def test_normalize_mime():
    """
    Exercise normalize_mime() with unknown types, exact matches, suffixed and
    mixed-case values, truncated prefixes, and the two alias special cases.
    """
    # (raw input, expected normalized value)
    cases = [
        ("asdf", None),
        ("application/pdf", "application/pdf"),
        ("application/pdf+journal", "application/pdf"),
        ("Application/PDF", "application/pdf"),
        ("application/p", None),
        ("application/xml+stuff", "text/xml"),
        ("application/x-pdf", "application/pdf"),
        ("application/x-html", None),
    ]
    for raw_value, wanted in cases:
        result = normalize_mime(raw_value)
        if wanted is None:
            assert result is None
        else:
            assert result == wanted
+
+
def parse_cdx_line(raw_cdx, normalize=True):
    """
    Parse a single whitespace-separated CDX line into a dict.

    raw_cdx: the CDX line as a string. Surrounding whitespace (including a
        trailing newline) and any fields after the eleventh are ignored.
    normalize: when True (the default) the mimetype field is passed through
        normalize_mime(); when False the field is kept as found in the line.

    Returns a dict with keys surt, url, datetime, mimetype, http_status,
    sha1b32, sha1hex, c_size, offset and warc. http_status, c_size and offset
    are converted to int; datetime is left as the digit string from the line.
    A missing ('-') or unrecognized mimetype becomes
    "application/octet-stream".

    Returns None if the line has fewer than 11 fields or any required field
    is missing or malformed.
    """

    cdx = raw_cdx.split()
    if len(cdx) < 11:
        return None

    surt = cdx[0]
    dt = cdx[1]
    url = cdx[2]
    # Keep the raw value here; normalizing twice would call normalize_mime()
    # on None whenever the first pass found no match.
    mime = cdx[3]
    if normalize:
        mime = normalize_mime(mime)
    http_status = cdx[4]
    sha1b32 = cdx[5]
    # Fields 6 and 7 are not read.
    c_size = cdx[8]
    offset = cdx[9]
    warc = cdx[10]

    # Every field later passed to int() must be all digits, otherwise the
    # conversion below would raise instead of rejecting the line.
    if not (sha1b32.isalnum() and len(sha1b32) == 32 and dt.isdigit()
            and http_status.isdigit() and c_size.isdigit()
            and offset.isdigit()):
        return None

    # '-' marks a missing value; the digit/alnum checks above already reject
    # it for the other required fields.
    if '-' in (surt, url, warc):
        return None

    if mime is None or mime == '-':
        mime = "application/octet-stream"

    try:
        sha1hex = b32_hex(sha1b32)
    except ValueError:
        # isalnum() admits characters outside the base-32 alphabet
        return None

    return dict(
        surt=surt,
        url=url,
        datetime=dt,
        mimetype=mime,
        http_status=int(http_status),
        sha1b32=sha1b32,
        sha1hex=sha1hex,
        c_size=int(c_size),
        offset=int(offset),
        warc=warc,
    )
+
def parse_cdx_datetime(dt_str):
    """
    Parse a CDX timestamp string in "%Y%m%d%H%M%S" form into a naive
    datetime.datetime object.

    Returns None if dt_str is not a string or does not match that format.
    """
    try:
        # The module is imported as `datetime`, so the class must be reached
        # as datetime.datetime; the module itself has no strptime().
        return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")
    except (ValueError, TypeError):
        return None
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index a7879c8..02deec9 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler import gen_file_metadata, b32_hex
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
def test_gen_file_metadata():
@@ -39,3 +39,33 @@ def test_b32_hex():
# invalid
with pytest.raises(ValueError):
assert b32_hex('blah') == 'blah'
+
def test_parse_cdx_line():
    """
    A well-formed CDX line parses to the expected dict, and a trailing
    newline or an extra trailing field does not change the result.
    """

    line = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    expected = dict(
        sha1b32="WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
        sha1hex="b2f65203da9929c2f758e8dd587b5524f904dbe6",
        mimetype="application/pdf",
        surt="edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
        url="https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
        datetime="20170828233154",
        warc="SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
        offset=931661233,
        c_size=210251,
        http_status=200,
    )

    # Bare line, line with trailing newline, line with an extra field
    for suffix in ("", "\n", " extra_field"):
        assert parse_cdx_line(line + suffix) == expected
+
def test_invalid_cdx():
    """Malformed CDX lines must make parse_cdx_line() return None."""

    print("missing warc")
    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
    assert parse_cdx_line(raw) is None

    print("bad datetime")
    # NOTE(review): besides the 13-digit timestamp, this line has a
    # non-numeric offset ("931661233i"). parse_cdx_line only checks that the
    # timestamp is all digits, so the offset is what gets the line rejected.
    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
    assert parse_cdx_line(raw) is None