diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-31 16:28:29 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-31 16:28:29 -0800 |
commit | c7de3e448a3aa197236f2ff8a254b207a103f95f (patch) | |
tree | 68851f0dea892d50a3dfcfcc9234ed53fa9c01d2 | |
parent | a6d8ea8068109bd0d26d11e47d04249e81b485b2 (diff) | |
download | arabesque-master.tar.gz arabesque-master.zip |
-rwxr-xr-x | arabesque.py | 4 |
1 files changed, 2 insertions, 2 deletions
```diff
diff --git a/arabesque.py b/arabesque.py
index e67bc0b..721b74f 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -452,8 +452,8 @@ def backward(log_file, map_db, output_db, hit_mimetypes=FULLTEXT_MIMETYPES):
 # convert to IA CDX timestamp format
 #final_timestamp = dateutil.parser.parse(line.timestamp).strftime("%Y%m%d%H%M%S")
 final_timestamp = None
-if len(line.timestamp) >= 12 and line.timestamp[4] != '-':
-    final_timestamp = line.timestamp[:12]
+if len(line.timestamp) >= 14 and line.timestamp[4] != '-':
+    final_timestamp = line.timestamp[:14]
 c.execute("INSERT INTO crawl_result VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
     (row.url, None, initial_domain, final_row.breadcrumbs, final_row.url, final_domain, final_timestamp, final_row.status_code, line.sha1, final_row.mimetype, final_row.is_dedupe, True, None))
 #print(final_row.breadcrumbs)
```