diff options
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/__init__.py | 3 | ||||
-rw-r--r-- | python/sandcrawler/grobid.py | 44 | ||||
-rw-r--r-- | python/sandcrawler/misc.py | 43 |
3 files changed, 90 insertions, 0 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
new file mode 100644
index 0000000..0120287
--- /dev/null
+++ b/python/sandcrawler/__init__.py
@@ -0,0 +1,3 @@
+
+from .grobid import GrobidClient
+from .misc import gen_file_metadata, b32_hex
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
new file mode 100644
index 0000000..0e37c0e
--- /dev/null
+++ b/python/sandcrawler/grobid.py
@@ -0,0 +1,44 @@
+
+import requests
+
+class GrobidClient(object):
+
+    def __init__(self, host_uri, **kwargs):
+        self.host_uri = host_uri
+        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))  # client-wide default
+
+    def process_fulltext(self, blob, consolidate_mode=None):
+        """
+        POSTs blob to the processFulltextDocument endpoint; returns dict with keys:
+            - status_code
+            - status (slug)
+            - error_msg (if status == 'error')
+            - tei_xml (if status_code is 200)
+
+        TODO: persist connection for performance?
+        """
+        assert blob
+
+        if consolidate_mode is None:  # no per-call override: use the client default
+            consolidate_mode = self.consolidate_mode
+
+        grobid_response = requests.post(
+            self.host_uri + "/api/processFulltextDocument",
+            files={
+                'input': blob,
+                'consolidate_mode': consolidate_mode,  # resolved value, not always the default
+            }
+        )
+
+        info = dict(
+            status_code=grobid_response.status_code,
+        )
+        if grobid_response.status_code == 200:
+            info['status'] = 'success'
+            info['tei_xml'] = grobid_response.text
+        else:
+            # response.text is .content decoded as utf-8
+            info['status'] = 'error'
+            info['error_msg'] = grobid_response.text[:10000]  # cap stored error text
+        return info
+
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
new file mode 100644
index 0000000..e13b5e7
--- /dev/null
+++ b/python/sandcrawler/misc.py
@@ -0,0 +1,43 @@
+
+import base64
+import magic
+import hashlib
+
+def gen_file_metadata(blob):
+    """
+    Takes a file blob (bytestream) and returns hashes and other metadata.
+
+    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
+    """
+    assert blob
+    mimetype = magic.Magic(mime=True).from_buffer(blob)
+    hashes = [
+        hashlib.sha1(),
+        hashlib.sha256(),
+        hashlib.md5(),
+    ]
+    for h in hashes:
+        h.update(blob)
+    return dict(
+        size_bytes=len(blob),
+        sha1hex=hashes[0].hexdigest(),  # indexes follow the order of the hashes list above
+        sha256hex=hashes[1].hexdigest(),
+        md5hex=hashes[2].hexdigest(),
+        mimetype=mimetype,
+    )
+
+def b32_hex(s):
+    """
+    Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+    base32 checksums are used by, eg, heritrix and in wayback CDX files
+    """
+    s = s.strip().split()[0].lower()  # keep only the first whitespace-separated token
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        if len(s) == 40:
+            return s  # 40 chars: passed through (lowercased) as already hex-length
+        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+