about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x backfill/backfill_hbase_from_cdx.py 14
1 files changed, 7 insertions, 7 deletions
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index 8008761..04ae7d9 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -76,8 +76,8 @@ def transform_line(raw_cdx):
key = "sha1:{}".format(key)
- info = dict(surt=surt, dt=dt, url=url, c_size=c_size, offset=offset,
- warc=warc)
+ info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
+ offset=int(offset), warc=warc)
warc_file = warc.split('/')[-1]
dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
@@ -87,7 +87,7 @@ def transform_line(raw_cdx):
return None
# 'i' intentionally not set
- heritrix = dict(u=url, d=dt_iso, f=warc_file, o=offset, c="1")
+ heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
def test_transform_line():
@@ -101,15 +101,15 @@ def test_transform_line():
'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
'dt': "20170828233154",
'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'offset': "931661233",
- 'c_size': "210251",
+ 'offset': 931661233,
+ 'c_size': 210251,
},
'f:c': {
'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
'd': "2017-08-28T23:31:54",
'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
- 'o': "931661233",
- 'c': "1",
+ 'o': 931661233,
+ 'c': 1,
}
}