diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 11:26:11 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 11:28:45 -0700 | 
| commit | 84169ce643075f1e49b18744d5609c7f1c48e7f7 (patch) | |
| tree | 887a2e63842f54da5fa11f371739fb5be7af8a55 /python | |
| parent | 2bdf3ac9b2716752371f4ed54f537c08ca270b53 (diff) | |
| download | sandcrawler-84169ce643075f1e49b18744d5609c7f1c48e7f7.tar.gz sandcrawler-84169ce643075f1e49b18744d5609c7f1c48e7f7.zip  | |
remove unused common.py
Diffstat (limited to 'python')
| -rw-r--r-- | python/common.py | 99 | ||||
| -rw-r--r-- | python/tests/test_common.py | 40 | 
2 files changed, 0 insertions, 139 deletions
diff --git a/python/common.py b/python/common.py deleted file mode 100644 index e596b35..0000000 --- a/python/common.py +++ /dev/null @@ -1,99 +0,0 @@ - -import json -from datetime import datetime - -NORMAL_MIME = ( -    'application/pdf', -    'application/postscript', -    'text/html', -    'text/xml', -) - -def normalize_mime(raw): -    raw = raw.lower() -    for norm in NORMAL_MIME: -        if raw.startswith(norm): -            return norm - -    # Special cases -    if raw.startswith('application/xml'): -        return 'text/xml' -    if raw.startswith('application/x-pdf'): -        return 'application/pdf' -    return None - - -def test_normalize_mime(): -    assert normalize_mime("asdf") is None -    assert normalize_mime("application/pdf") == "application/pdf" -    assert normalize_mime("application/pdf+journal") == "application/pdf" -    assert normalize_mime("Application/PDF") == "application/pdf" -    assert normalize_mime("application/p") is None -    assert normalize_mime("application/xml+stuff") == "text/xml" -    assert normalize_mime("application/x-pdf") == "application/pdf" -    assert normalize_mime("application/x-html") is None - - -def parse_cdx_line(raw_cdx): - -    cdx = raw_cdx.split() -    if len(cdx) < 11: -        return None - -    surt = cdx[0] -    dt = cdx[1] -    url = cdx[2] -    mime = normalize_mime(cdx[3]) -    http_status = cdx[4] -    key = cdx[5] -    c_size = cdx[8] -    offset = cdx[9] -    warc = cdx[10] - -    if not (key.isalnum() and c_size.isdigit() and offset.isdigit() -            and http_status == "200" and len(key) == 32 and dt.isdigit() -            and mime != None): -        return None - -    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc): -        return None - -    key = "sha1:{}".format(key) - -    info = dict(surt=surt, dt=dt, url=url, c_size=int(c_size), -        offset=int(offset), warc=warc) - -    warc_file = warc.split('/')[-1] -    try: -        dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat() -    except Exception: -        return None - -    # 'i' intentionally not set -    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=int(offset), c=1) -    return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix} - -def parse_ungrobided_line(raw_line): - -    line = raw_line.strip().split("\t") -    if len(line) != 4: -        return None - -    key = line[0] -    mime = normalize_mime(line[2]) -    try: -        f_c = json.loads(line[1]) -        cdx = json.loads(line[3]) -    except json.JSONDecodeError: -        return None - -    if not (key[5:].isalnum() and len(key) == 37 and mime != None): -        print(mime) -        print(key) -        print("FAIL") -        return None - -    if '-' in (key, mime, f_c, cdx): -        return None - -    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c} diff --git a/python/tests/test_common.py b/python/tests/test_common.py deleted file mode 100644 index 34d50ed..0000000 --- a/python/tests/test_common.py +++ /dev/null @@ -1,40 +0,0 @@ - -from common import * - - -def test_parse_cdx_line(): - -    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" -    correct = { -        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G", -        'file:mime': "application/pdf", -        'file:cdx': { -            'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", -            'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", -            'dt': "20170828233154", -            'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", -            'offset': 931661233, -            'c_size': 210251, -        }, -        'f:c': { -            'u': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", -            'd': "2017-08-28T23:31:54", -            'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", -            'o': 931661233, -            'c': 1, -        } -    } - -    assert parse_cdx_line(raw) == correct -    assert parse_cdx_line(raw + "\n") == correct -    assert parse_cdx_line(raw + " extra_field") == correct - -def test_invalid_cdx(): - -    print("missing warc") -    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -" -    assert parse_cdx_line(raw) == None - -    print("bad datetime") -    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"  -    assert parse_cdx_line(raw) == None  | 
