aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-09-26 12:00:01 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-09-26 12:00:01 -0700
commit: 37bf997dc0220a30605249655056e90f04e33366 (patch)
tree: 3f6a3586462d25c02b5fd219b0c754aef2976e3c /python/sandcrawler/misc.py
parent: c3c5a6ef57e83ff4395f9f87e7e372c6c371e4a5 (diff)
download: sandcrawler-37bf997dc0220a30605249655056e90f04e33366.tar.gz
sandcrawler-37bf997dc0220a30605249655056e90f04e33366.zip
lots of grobid tool implementation (still WIP)
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- python/sandcrawler/misc.py 16
1 files changed, 11 insertions, 5 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index f741f93..4ffc5d7 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -75,6 +75,11 @@ def test_normalize_mime():
def parse_cdx_line(raw_cdx, normalize=True):
+ """
+ This method always filters a few things out:
+
+ - non-HTTP requests, based on lack of status code (eg, whois)
+ """
cdx = raw_cdx.split()
if len(cdx) < 11:
@@ -84,8 +89,6 @@ def parse_cdx_line(raw_cdx, normalize=True):
dt = cdx[1]
url = cdx[2]
mime = normalize_mime(cdx[3])
- if normalize:
- mime = normalize_mime(mime)
http_status = cdx[4]
sha1b32 = cdx[5]
c_size = cdx[8]
@@ -102,6 +105,9 @@ def parse_cdx_line(raw_cdx, normalize=True):
if mime is None or mime == '-':
mime = "application/octet-stream"
+ if normalize:
+ mime = normalize_mime(mime)
+
sha1hex = b32_hex(sha1b32)
http_status = int(http_status)
c_size = int(c_size)
@@ -115,9 +121,9 @@ def parse_cdx_line(raw_cdx, normalize=True):
http_status=http_status,
sha1b32=sha1b32,
sha1hex=sha1hex,
- c_size=c_size,
- offset=offset,
- warc=warc,
+ warc_csize=c_size,
+ warc_offset=offset,
+ warc_path=warc,
)
def parse_cdx_datetime(dt_str):