1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
"""
Helper routines.
Many of these are copied verbatim from the fatcat or sandcrawler repositories.
"""
import os
import sys
import copy
import json
import magic
import hashlib
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def gen_file_metadata(blob):
    """
    Compute size, digests, and MIME type for an in-memory file blob.

    blob: the raw file content; must be truthy (an empty blob trips the
        assertion below)

    Returns a dict with keys: size_bytes, sha1hex, sha256hex, md5hex, mimetype
    """
    assert blob
    # MIME type as reported by libmagic for the buffer contents
    detected_type = magic.Magic(mime=True).from_buffer(blob)
    # Each digest is computed over the entire blob in one shot
    sha1_digest = hashlib.sha1(blob)
    sha256_digest = hashlib.sha256(blob)
    md5_digest = hashlib.md5(blob)
    return {
        "size_bytes": len(blob),
        "sha1hex": sha1_digest.hexdigest(),
        "sha256hex": sha256_digest.hexdigest(),
        "md5hex": md5_digest.hexdigest(),
        "mimetype": detected_type,
    }
def requests_retry_session(retries=2, backoff_factor=3,
        status_forcelist=(500, 502, 504), session=None):
    """
    Return a requests session whose HTTP and HTTPS transports retry on failure.

    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests

    retries: used for the total, read, and connect retry limits alike
    backoff_factor: passed through to the Retry policy
    status_forcelist: passed through to the Retry policy
    session: existing session to configure; a new requests.Session is
        created when this is not given

    Returns the configured session (the same object that was passed in, if any).
    """
    if not session:
        session = requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # A single adapter instance is shared by both URL schemes
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
    for scheme_prefix in ('http://', 'https://'):
        session.mount(scheme_prefix, retrying_adapter)
    return session
def blob_path(sha1hex, directory="", file_suffix="", base_dir="."):
    """
    Build the on-disk path for a blob, sharded by the first two characters
    of its SHA-1 hex digest.

    sha1hex: hex digest naming the blob
    directory: prefix placed before the shard directory, eg, "png/"
    file_suffix: appended after the digest, eg, ".png"
    base_dir: root of the path; joined with a literal "/"

    Layout: <base_dir>/<directory><first two chars of sha1hex>/<sha1hex><file_suffix>
    """
    shard = sha1hex[:2]
    layout = "{base}/{prefix}{shard}/{digest}{suffix}"
    return layout.format(
        base=base_dir,
        prefix=directory,
        shard=shard,
        digest=sha1hex,
        suffix=file_suffix,
    )
def find_local_file(files, base_dir="."):
    """
    Scan a list of fatcat file entities (as dicts) for one whose PDF exists
    on local disk.

    An entity is skipped when its 'mimetype' is set and does not contain
    "pdf" (case-insensitive); entities with an empty mimetype are still
    checked. The expected location is derived from the entity's 'sha1' key
    via blob_path(), under "<base_dir>/pdf/" with a ".pdf" suffix.

    Returns a deep copy of the first matching entity, or None if no entity
    has a local file.
    """
    for entity in files:
        mimetype = entity['mimetype']
        if mimetype and 'pdf' not in mimetype.lower():
            continue
        candidate = blob_path(entity['sha1'], directory="pdf/", file_suffix=".pdf", base_dir=base_dir)
        if not os.path.isfile(candidate):
            continue
        # Copy so callers cannot mutate the entity held in the input list
        return copy.deepcopy(entity)
    return None
|