aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-31 16:28:29 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-31 16:28:29 -0800
commit c7de3e448a3aa197236f2ff8a254b207a103f95f (patch)
tree 68851f0dea892d50a3dfcfcc9234ed53fa9c01d2
parent a6d8ea8068109bd0d26d11e47d04249e81b485b2 (diff)
download arabesque-master.tar.gz
arabesque-master.zip
fix IA datetime length (14 not 12) HEAD master
-rwxr-xr-x arabesque.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/arabesque.py b/arabesque.py
index e67bc0b..721b74f 100755
--- a/arabesque.py
+++ b/arabesque.py
@@ -452,8 +452,8 @@ def backward(log_file, map_db, output_db, hit_mimetypes=FULLTEXT_MIMETYPES):
# convert to IA CDX timestamp format
#final_timestamp = dateutil.parser.parse(line.timestamp).strftime("%Y%m%d%H%M%S")
final_timestamp = None
- if len(line.timestamp) >= 12 and line.timestamp[4] != '-':
- final_timestamp = line.timestamp[:12]
+ if len(line.timestamp) >= 14 and line.timestamp[4] != '-':
+ final_timestamp = line.timestamp[:14]
c.execute("INSERT INTO crawl_result VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
(row.url, None, initial_domain, final_row.breadcrumbs, final_row.url, final_domain, final_timestamp, final_row.status_code, line.sha1, final_row.mimetype, final_row.is_dedupe, True, None))
#print(final_row.breadcrumbs)