diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-01 15:30:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-01 15:30:05 -0700 |
commit | 50f81581a3d50a2a6a6fb0e55fbdf178897b2818 (patch) | |
tree | 87fd23bd6c8d52e2ded3cfe9bb1354bf238f84a6 /fatcat_covid19 | |
parent | 11112bf1c52827cede04a88c742de507ed3b0404 (diff) | |
download | fatcat-covid19-50f81581a3d50a2a6a6fb0e55fbdf178897b2818.tar.gz fatcat-covid19-50f81581a3d50a2a6a6fb0e55fbdf178897b2818.zip |
start python module directory
Diffstat (limited to 'fatcat_covid19')
-rw-r--r-- | fatcat_covid19/__init__.py | 0 | ||||
-rw-r--r-- | fatcat_covid19/common.py | 88 |
2 files changed, 88 insertions, 0 deletions
# fatcat_covid19/common.py

"""
Helper routines.

Many of these copied verbatim from fatcat or sandcrawler repositories.
"""

import os
import sys
import copy
import json
import magic
import hashlib

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error


def gen_file_metadata(blob):
    """
    Takes a file blob (bytestream) and returns hashes and other metadata.

    blob: non-empty bytes object holding the complete file contents.

    Returns a dict: size_bytes, md5hex, sha1hex, sha256hex, mimetype

    Raises AssertionError if blob is empty or None.
    """
    # Kept as an assert (not a ValueError) so existing callers that expect
    # AssertionError keep working; note it is skipped under `python -O`.
    assert blob
    # The mimetype is sniffed from the content itself by libmagic.
    mimetype = magic.Magic(mime=True).from_buffer(blob)
    sha1 = hashlib.sha1(blob)
    sha256 = hashlib.sha256(blob)
    md5 = hashlib.md5(blob)
    return dict(
        size_bytes=len(blob),
        sha1hex=sha1.hexdigest(),
        sha256hex=sha256.hexdigest(),
        md5hex=md5.hexdigest(),
        mimetype=mimetype,
    )


def requests_retry_session(retries=2, backoff_factor=3,
        status_forcelist=(500, 502, 504), session=None):
    """
    Returns a requests session whose http:// and https:// requests are
    retried automatically.

    retries: used as the total, read, and connect retry limit
    backoff_factor: passed through to urllib3's Retry
    status_forcelist: HTTP status codes passed through to Retry
    session: optional existing session; if given, it is modified in place
        (adapters are mounted on it) and returned. Otherwise a new
        requests.Session is created.

    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # A single adapter carrying the retry policy is shared by both schemes.
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def blob_path(sha1hex, directory="", file_suffix="", base_dir="."):
    """
    Builds the path of a blob stored under a two-character hash prefix
    directory: "<base_dir>/<directory><sha1hex[0:2]>/<sha1hex><file_suffix>"

    sha1hex: hex digest string identifying the blob
    directory: eg, "png/" (must carry its own trailing slash)
    file_suffix: eg, ".png"
    base_dir: root directory, without a trailing slash

    Example: blob_path("abcdef", directory="pdf/", file_suffix=".pdf")
    returns "./pdf/ab/abcdef.pdf"
    """
    fpath = "{}/{}{}/{}{}".format(
        base_dir,
        directory,
        sha1hex[0:2],
        sha1hex,
        file_suffix)
    return fpath


def find_local_file(files, base_dir="."):
    """
    Takes a list of fatcat file entities (as dicts), and looks for a local file (PDF).

    Entities with a mimetype that does not mention "pdf" are skipped;
    entities with no mimetype are still checked on disk. Entities without a
    'sha1' value are skipped because no path can be derived for them.

    files: list of dicts; None is treated as an empty list
    base_dir: root directory passed to blob_path()

    If none found, returns None.
    If found, returns a deep copy of the file entity; the path can be
    determined from its 'sha1' field via blob_path().
    """
    for f in files or []:
        mimetype = f.get('mimetype')
        if mimetype and 'pdf' not in mimetype.lower():
            continue
        sha1hex = f.get('sha1')
        if not sha1hex:
            continue
        pdf_path = blob_path(sha1hex, directory="pdf/", file_suffix=".pdf", base_dir=base_dir)
        if os.path.isfile(pdf_path):
            # Return a copy so the caller can modify it without touching the input list.
            return copy.deepcopy(f)
    return None