diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-26 12:00:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-26 12:00:01 -0700 |
commit | 37bf997dc0220a30605249655056e90f04e33366 (patch) | |
tree | 3f6a3586462d25c02b5fd219b0c754aef2976e3c /python/sandcrawler/misc.py | |
parent | c3c5a6ef57e83ff4395f9f87e7e372c6c371e4a5 (diff) | |
download | sandcrawler-37bf997dc0220a30605249655056e90f04e33366.tar.gz sandcrawler-37bf997dc0220a30605249655056e90f04e33366.zip |
lots of grobid tool implementation (still WIP)
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- | python/sandcrawler/misc.py | 16 |
1 files changed, 11 insertions, 5 deletions
```diff
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index f741f93..4ffc5d7 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -75,6 +75,11 @@ def test_normalize_mime():
 
 
 def parse_cdx_line(raw_cdx, normalize=True):
+    """
+    This method always filters a few things out:
+
+    - non-HTTP requests, based on lack of status code (eg, whois)
+    """
 
     cdx = raw_cdx.split()
     if len(cdx) < 11:
@@ -84,8 +89,6 @@ def parse_cdx_line(raw_cdx, normalize=True):
     dt = cdx[1]
     url = cdx[2]
     mime = normalize_mime(cdx[3])
-    if normalize:
-        mime = normalize_mime(mime)
     http_status = cdx[4]
     sha1b32 = cdx[5]
     c_size = cdx[8]
@@ -102,6 +105,9 @@ def parse_cdx_line(raw_cdx, normalize=True):
     if mime is None or mime == '-':
         mime = "application/octet-stream"
 
+    if normalize:
+        mime = normalize_mime(mime)
+
     sha1hex = b32_hex(sha1b32)
     http_status = int(http_status)
     c_size = int(c_size)
@@ -115,9 +121,9 @@ def parse_cdx_line(raw_cdx, normalize=True):
         http_status=http_status,
         sha1b32=sha1b32,
         sha1hex=sha1hex,
-        c_size=c_size,
-        offset=offset,
-        warc=warc,
+        warc_csize=c_size,
+        warc_offset=offset,
+        warc_path=warc,
     )
 
 def parse_cdx_datetime(dt_str):
```