diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-26 12:00:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-26 12:00:01 -0700 |
commit | 37bf997dc0220a30605249655056e90f04e33366 (patch) | |
tree | 3f6a3586462d25c02b5fd219b0c754aef2976e3c /python/tests/test_misc.py | |
parent | c3c5a6ef57e83ff4395f9f87e7e372c6c371e4a5 (diff) | |
download | sandcrawler-37bf997dc0220a30605249655056e90f04e33366.tar.gz sandcrawler-37bf997dc0220a30605249655056e90f04e33366.zip |
lots of grobid tool implementation (still WIP)
Diffstat (limited to 'python/tests/test_misc.py')
-rw-r--r-- | python/tests/test_misc.py | 6 |
1 file changed, 3 insertions, 3 deletions
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 02deec9..420bc07 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -50,9 +50,9 @@ def test_parse_cdx_line():
         'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
         'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
         'datetime': "20170828233154",
-        'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
-        'offset': 931661233,
-        'c_size': 210251,
+        'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+        'warc_offset': 931661233,
+        'warc_csize': 210251,
         'http_status': 200,
     }