# Reconstructed from a whitespace-collapsed patch:
#   commit:  b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3
#   author:  Bryan Newbold
#   date:    Mon, 23 Sep 2019 22:58:55 -0700
#   subject: start refactoring sandcrawler python common code
#
# Files created by the patch (5 files changed, 131 insertions):
#   python/sandcrawler/__init__.py
#   python/sandcrawler/grobid.py
#   python/sandcrawler/misc.py
#   python/tests/files/dummy.pdf   (binary, 13264 bytes)
#   python/tests/test_misc.py

import base64
import hashlib

# ---- python/sandcrawler/__init__.py ----
# The package re-exports these three names:
#   from .grobid import GrobidClient
#   from .misc import gen_file_metadata, b32_hex
__all__ = ["GrobidClient", "gen_file_metadata", "b32_hex"]


# ---- python/sandcrawler/grobid.py ----

class GrobidClient(object):
    """
    Thin HTTP client for the "/api/processFulltextDocument" endpoint of a
    GROBID server.
    """

    def __init__(self, host_uri, **kwargs):
        """
        host_uri: base URI of the server; the endpoint path is appended to it.
        consolidate_mode (keyword, optional): default mode sent with every
            request; coerced with int(), defaults to 1.
        """
        self.host_uri = host_uri
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))

    def _request_files(self, blob, consolidate_mode=None):
        """
        Builds the multipart parts for a fulltext request.

        A consolidate_mode of None falls back to the client-wide default;
        any other value (including 0) overrides it for this request only.
        """
        if consolidate_mode is None:
            consolidate_mode = self.consolidate_mode
        return {
            'input': blob,
            'consolidate_mode': consolidate_mode,
        }

    def process_fulltext(self, blob, consolidate_mode=None):
        """
        POSTs a document blob to the server and summarizes the response.

        Returns dict with keys:
            - status_code
            - status (slug)
            - error_msg (if status == 'error')
            - tei_xml (if status is 200)

        TODO: persist connection for performance?
        """
        # Imported lazily so the rest of this code is importable (and
        # testable) without the HTTP dependency installed.
        import requests

        assert blob

        # The per-call override is honored here; previously it was resolved
        # but the instance default was always the value actually sent.
        grobid_response = requests.post(
            self.host_uri + "/api/processFulltextDocument",
            files=self._request_files(blob, consolidate_mode),
        )

        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
        else:
            # response.text is the decoded body; cap it so that a huge error
            # page does not get carried around in the result dict.
            info['status'] = 'error'
            info['error_msg'] = grobid_response.text[:10000]
        return info


# ---- python/sandcrawler/misc.py ----

def gen_file_metadata(blob):
    """
    Takes a file blob (bytestream) and returns hashes and other metadata.

    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
    """
    # Imported lazily so that b32_hex() stays usable when the "magic"
    # module is not installed.
    import magic

    assert blob
    mimetype = magic.Magic(mime=True).from_buffer(blob)
    return dict(
        size_bytes=len(blob),
        sha1hex=hashlib.sha1(blob).hexdigest(),
        sha256hex=hashlib.sha256(blob).hexdigest(),
        md5hex=hashlib.md5(blob).hexdigest(),
        mimetype=mimetype,
    )


def b32_hex(s):
    """
    Converts a base32-encoded SHA-1 checksum into hex-encoded

    base32 checksums are used by, eg, heritrix and in wayback CDX files

    Only the first whitespace-separated token is considered, and an optional
    "sha1:" prefix is dropped. A 40-character token is assumed to already be
    hex and is returned lower-cased without further validation.

    Raises ValueError if the input is empty/blank, has an unexpected length,
    or is not decodable as base32.
    """
    tokens = s.split()
    if not tokens:
        # Previously this surfaced as an IndexError from split()[0].
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
    digest = tokens[0].lower()
    if digest.startswith("sha1:"):
        digest = digest[5:]
    if len(digest) == 40:
        return digest
    if len(digest) != 32:
        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(digest))
    # b32decode only accepts the upper-case alphabet by default; a decode
    # failure raises binascii.Error, which is a ValueError subclass.
    return base64.b32decode(digest.upper()).hex()

# The patch continues with python/tests/files/dummy.pdf (binary file, new)
# and python/tests/test_misc.py.
# ---- python/tests/test_misc.py ----
# (new file in the patch; patch trailer: cgit v1.2.3)

import os

import pytest

from sandcrawler import gen_file_metadata, b32_hex

# Fixtures live in "files/" next to this module; resolving them from here
# keeps the tests independent of the directory pytest is started from.
FILES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'files')


def test_gen_file_metadata():
    """Checks size, hashes and mimetype for a PDF, an HTML and a text blob."""

    # valid (but very small) PDF file
    with open(os.path.join(FILES_DIR, 'dummy.pdf'), 'rb') as f:
        file_meta = gen_file_metadata(f.read())
    assert file_meta == {
        'mimetype': 'application/pdf',
        'md5hex': '2942bfabb3d05332b66eb128e0842cff',
        'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36',
        'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
        'size_bytes': 13264,
    }

    # valid HTML
    # NOTE(review): the markup was missing from this fixture (it read
    # "dummyhtml document"), which cannot be detected as text/html; the tags
    # below are a reconstruction -- confirm against the upstream commit.
    fm = gen_file_metadata(
        b"""<html><head><title>dummy</title></head><body>html document</body></html>""")
    assert fm['mimetype'] == 'text/html'

    # bogus text
    fm = gen_file_metadata(b"asdf1234")
    assert fm['mimetype'] == 'text/plain'
    assert fm['size_bytes'] == 8


def test_b32_hex():
    """Checks base32 -> hex conversion, hex pass-through and rejection."""

    # valid b32
    assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
    assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'

    # sha1hex pass-through
    s = 'bda3c1017d52e826bbd1da51efad877272d300f9'
    assert b32_hex(s) == s

    # invalid: the call itself must raise, so there is nothing to compare
    with pytest.raises(ValueError):
        b32_hex('blah')