aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-09-23 22:58:55 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-09-23 22:58:55 -0700
commit b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3 (patch)
tree a47d7c07aebd01e3d26c86e664fa0f59b0786bcc
parent e2508ee3da64b46f47aec25361839f29de5e73c0 (diff)
download sandcrawler-b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3.tar.gz
sandcrawler-b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3.zip
start refactoring sandcrawler python common code
-rw-r--r-- python/sandcrawler/__init__.py 3
-rw-r--r-- python/sandcrawler/grobid.py 44
-rw-r--r-- python/sandcrawler/misc.py 43
-rw-r--r-- python/tests/files/dummy.pdf bin 0 -> 13264 bytes
-rw-r--r-- python/tests/test_misc.py 41
5 files changed, 131 insertions, 0 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
new file mode 100644
index 0000000..0120287
--- /dev/null
+++ b/python/sandcrawler/__init__.py
@@ -0,0 +1,3 @@
+
+from .grobid import GrobidClient
+from .misc import gen_file_metadata, b32_hex
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
new file mode 100644
index 0000000..0e37c0e
--- /dev/null
+++ b/python/sandcrawler/grobid.py
@@ -0,0 +1,44 @@
+
+import requests
+
+class GrobidClient(object):
+    """
+    Small HTTP client for a GROBID service.
+
+    host_uri is the base URL of the service; the endpoint path
+    "/api/processFulltextDocument" is appended to it for requests.
+
+    Optional keyword arguments:
+    - consolidate_mode: default mode sent with each request; coerced with
+      int(), defaults to 1
+    """
+
+    def __init__(self, host_uri, **kwargs):
+        self.host_uri = host_uri
+        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))
+
+    def process_fulltext(self, blob, consolidate_mode=None):
+        """
+        POSTs a document blob to the fulltext endpoint and summarizes the
+        response.
+
+        If consolidate_mode is None, the instance default
+        (self.consolidate_mode) is used; otherwise the passed value is sent.
+
+        Returns dict with keys:
+        - status_code
+        - status (slug)
+        - error_msg (if status == 'error')
+        - tei_xml (if status_code is 200)
+
+        Raises AssertionError if blob is empty/falsy.
+
+        TODO: persist connection for performance?
+        """
+        assert blob
+
+        if consolidate_mode is None:
+            consolidate_mode = self.consolidate_mode
+
+        # NOTE(review): no timeout is passed, so this call can block
+        # indefinitely if the service hangs -- confirm whether that is
+        # intended.
+        # NOTE(review): GROBID's documented form fields appear to be
+        # 'consolidateHeader' / 'consolidateCitations'; confirm the server
+        # actually honors a 'consolidate_mode' field.
+        grobid_response = requests.post(
+            self.host_uri + "/api/processFulltextDocument",
+            files={
+                'input': blob,
+                # send the resolved per-call value, not the instance default
+                'consolidate_mode': consolidate_mode,
+            }
+        )
+
+        info = dict(
+            status_code=grobid_response.status_code,
+        )
+        if grobid_response.status_code == 200:
+            info['status'] = 'success'
+            info['tei_xml'] = grobid_response.text
+        else:
+            # truncate the error body so a huge response does not get
+            # carried around in the result dict
+            info['status'] = 'error'
+            info['error_msg'] = grobid_response.text[:10000]
+        return info
+
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
new file mode 100644
index 0000000..e13b5e7
--- /dev/null
+++ b/python/sandcrawler/misc.py
@@ -0,0 +1,43 @@
+
+import base64
+import magic
+import hashlib
+
+def gen_file_metadata(blob):
+    """
+    Computes size, checksums, and detected mimetype for an in-memory file.
+
+    blob must be a non-empty bytestring (an empty blob fails the assert).
+
+    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype
+    """
+    assert blob
+    detected_type = magic.Magic(mime=True).from_buffer(blob)
+    sha1_digest = hashlib.sha1(blob).hexdigest()
+    sha256_digest = hashlib.sha256(blob).hexdigest()
+    md5_digest = hashlib.md5(blob).hexdigest()
+    return {
+        'size_bytes': len(blob),
+        'sha1hex': sha1_digest,
+        'sha256hex': sha256_digest,
+        'md5hex': md5_digest,
+        'mimetype': detected_type,
+    }
+
+def b32_hex(s):
+    """
+    Converts a base32-encoded SHA-1 checksum into hex-encoded
+
+    base32 checksums are used by, eg, heritrix and in wayback CDX files
+
+    Only the first whitespace-separated token of s is considered, and an
+    optional "sha1:" prefix is dropped. A 40-character token is returned
+    lowercased as-is, without checking that it is valid hex.
+
+    Raises ValueError if the input is empty or whitespace-only, if the
+    token is neither 32 nor 40 characters long, or if it is not valid
+    base32.
+    """
+    tokens = s.split()
+    if not tokens:
+        # empty input used to escape as IndexError from split()[0]; report
+        # it the same way as every other malformed checksum
+        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+    s = tokens[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) == 40:
+        # 40 characters is the length of a hex SHA-1: pass it through
+        return s
+    if len(s) != 32:
+        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s))
+    # b32decode wants the upper-case alphabet; bytes.hex() is lower-case
+    return base64.b32decode(s.upper()).hex()
+
diff --git a/python/tests/files/dummy.pdf b/python/tests/files/dummy.pdf
new file mode 100644
index 0000000..774c2ea
--- /dev/null
+++ b/python/tests/files/dummy.pdf
Binary files differ
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
new file mode 100644
index 0000000..a7879c8
--- /dev/null
+++ b/python/tests/test_misc.py
@@ -0,0 +1,41 @@
+
+import pytest
+
+from sandcrawler import gen_file_metadata, b32_hex
+
+def test_gen_file_metadata():
+    """
+    Checks gen_file_metadata() against a PDF fixture, an HTML snippet, and
+    a short plain-text blob.
+    """
+
+    # valid (but very small) PDF file
+    with open('tests/files/dummy.pdf', 'rb') as pdf_file:
+        pdf_bytes = pdf_file.read()
+    expected_pdf_meta = dict(
+        mimetype='application/pdf',
+        md5hex='2942bfabb3d05332b66eb128e0842cff',
+        sha1hex='90ffd2359008d82298821d16b21778c5c39aec36',
+        sha256hex='3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4',
+        size_bytes=13264,
+    )
+    assert gen_file_metadata(pdf_bytes) == expected_pdf_meta
+
+    # valid HTML
+    html_bytes = b"""<html><head><title>dummy</title></head><body>html document</body></html>"""
+    html_meta = gen_file_metadata(html_bytes)
+    assert html_meta['mimetype'] == 'text/html'
+
+    # bogus text
+    text_meta = gen_file_metadata(b"asdf1234")
+    assert text_meta['mimetype'] == 'text/plain'
+    assert text_meta['size_bytes'] == 8
+
+def test_b32_hex():
+    """
+    Checks b32_hex() on base32 input (with and without the "sha1:" prefix),
+    on hex pass-through, and on invalid input.
+    """
+    expected_hex = '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982'
+
+    # valid b32
+    assert b32_hex('sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == expected_hex
+    assert b32_hex('TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == expected_hex
+
+    # sha1hex pass-through
+    hex_digest = 'bda3c1017d52e826bbd1da51efad877272d300f9'
+    assert b32_hex(hex_digest) == hex_digest
+
+    # invalid
+    with pytest.raises(ValueError):
+        assert b32_hex('blah') == 'blah'