diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-02 14:50:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-02 14:50:05 -0700 |
commit | f1c4935b2d55f3acecd824ba9b6318a33820fafe (patch) | |
tree | e7e5496dfd39a891f33c78599253ba2755ec5e91 /backfill/backfill_hbase_from_cdx.py | |
parent | 31d5a1ebdfe2f4638ae1e5ec87ff228eef9114f5 (diff) | |
download | sandcrawler-f1c4935b2d55f3acecd824ba9b6318a33820fafe.tar.gz sandcrawler-f1c4935b2d55f3acecd824ba9b6318a33820fafe.zip |
heritrix expects ints, not strings, for numbers
Diffstat (limited to 'backfill/backfill_hbase_from_cdx.py')
-rwxr-xr-x | backfill/backfill_hbase_from_cdx.py | 14 |
1 file changed, 7 insertions, 7 deletions
```diff
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index 8008761..04ae7d9 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -76,8 +76,8 @@ def transform_line(raw_cdx):
 
     key = "sha1:{}".format(key)
 
-    info = dict(surt=surt, dt=dt, url=url, c_size=c_size, offset=offset,
-        warc=warc)
+    info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size),
+        offset=int(offset), warc=warc)
 
     warc_file = warc.split('/')[-1]
     dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
@@ -87,7 +87,7 @@ def transform_line(raw_cdx):
         return None
 
     # 'i' intentionally not set
-    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=offset, c="1")
+    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1)
     return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
 
 def test_transform_line():
@@ -101,15 +101,15 @@ def test_transform_line():
             'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
             'dt': "20170828233154",
             'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-            'offset': "931661233",
-            'c_size': "210251",
+            'offset': 931661233,
+            'c_size': 210251,
         },
         'f:c': {
             'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
             'd': "2017-08-28T23:31:54",
             'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-            'o': "931661233",
-            'c': "1",
+            'o': 931661233,
+            'c': 1,
         }
     }
 
```