path: root/fatcat_covid19/common.py
diff options
Diffstat (limited to 'fatcat_covid19/common.py')
1 files changed, 88 insertions, 0 deletions
diff --git a/fatcat_covid19/common.py b/fatcat_covid19/common.py
new file mode 100644
index 0000000..97bc675
--- /dev/null
+++ b/fatcat_covid19/common.py
@@ -0,0 +1,88 @@
+Helper routines.
+Many of these were copied verbatim from the fatcat or sandcrawler repositories.
+import os
+import sys
+import copy
+import json
+import magic
+import hashlib
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def gen_file_metadata(blob):
    """
    Compute size, digests, and detected mimetype for an in-memory file.

    blob: non-empty bytestream holding the file contents

    Returns a dict with keys: size_bytes, sha1hex, sha256hex, md5hex, mimetype
    """
    # An empty (or missing) blob is treated as a caller error.
    assert blob

    # Content sniffing happens first, exactly one detector per call.
    detector = magic.Magic(mime=True)
    detected_type = detector.from_buffer(blob)

    metadata = {"size_bytes": len(blob)}
    # Key order (sha1, sha256, md5) is kept stable for serialized output.
    for algo in ("sha1", "sha256", "md5"):
        metadata[algo + "hex"] = hashlib.new(algo, blob).hexdigest()
    metadata["mimetype"] = detected_type
    return metadata
def requests_retry_session(retries=2, backoff_factor=3,
        status_forcelist=(500, 502, 504), session=None):
    """
    Return a requests session whose HTTP and HTTPS adapters use a retry policy.

    retries: used as the total, read, and connect limits of the Retry object
    backoff_factor: passed through to Retry
    status_forcelist: passed through to Retry
    session: existing session to configure; a new requests.Session() is
        created when none is given

    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance is mounted for both URL schemes.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, retrying_adapter)
    return session
def blob_path(sha1hex, directory="", file_suffix="", base_dir="."):
    """
    Build the sharded local path for a blob identified by its SHA-1 hex digest.

    Layout: {base_dir}/{directory}{first two chars of sha1hex}/{sha1hex}{file_suffix}

    sha1hex: hex digest string; its first two characters name the shard directory
    directory: optional sub-directory prefix, eg, "png/". A missing trailing
        slash is added, so "png" and "png/" give the same path.
    file_suffix: eg, ".png"
    base_dir: root directory the path is built under

    Returns the path as a string; the filesystem is not touched.
    """
    # Without this, directory="png" would fuse with the shard prefix
    # ("./pngab/...") instead of forming its own path component.
    if directory and not directory.endswith("/"):
        directory += "/"
    shard = sha1hex[0:2]
    return "{}/{}{}/{}{}".format(base_dir, directory, shard, sha1hex, file_suffix)
def find_local_file(files, base_dir="."):
    """
    Takes a list of fatcat file entities (as dicts), and looks for a local file (PDF).

    files: iterable of dicts; each is read for its 'mimetype' and 'sha1' keys.
        May be None or empty.
    base_dir: root directory passed to blob_path() when building the PDF path

    Entities whose mimetype is set and does not contain "pdf" are skipped.
    Entities with no mimetype are still checked on disk. Entities without a
    'sha1' value are skipped, because no path can be built for them.

    If none found, returns None.
    If found, returns a deep copy of the file entity; the path can be
    determined from its 'sha1' field (see blob_path()).
    """
    for entity in files or []:
        # .get(): the key may be absent altogether, not just set to None.
        mimetype = entity.get('mimetype')
        if mimetype and 'pdf' not in mimetype.lower():
            continue
        sha1hex = entity.get('sha1')
        if not sha1hex:
            continue
        pdf_path = blob_path(sha1hex, directory="pdf/", file_suffix=".pdf", base_dir=base_dir)
        if os.path.isfile(pdf_path):
            # Copy so callers can modify the result without touching the input.
            return copy.deepcopy(entity)
    return None