about summary refs log tree commit diff stats
path: root/fatcat_scholar/sandcrawler.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:32 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-29 17:05:35 -0700
commit c4f5ba60cf3581dc58875a4e56f8964560496753 (patch)
tree 43795fa1223961f28f2fac52a29e5e155a85f77f /fatcat_scholar/sandcrawler.py
parent 0f8b248259b4b57e425f7420883cb141565b2b22 (diff)
download fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.tar.gz
fatcat-scholar-c4f5ba60cf3581dc58875a4e56f8964560496753.zip
fetch pdftotext and pdf_meta from blobs, postgrest
This replaces the temporary COVID-19 content hack with production content (text, thumbnail URLs) stored in postgrest and seaweedfs.
Diffstat (limited to 'fatcat_scholar/sandcrawler.py')
-rw-r--r-- fatcat_scholar/sandcrawler.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 6f0f85c..9c48cd9 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -16,6 +16,15 @@ class SandcrawlerPostgrestClient:
else:
return None
+ def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
+ resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1))
+ resp.raise_for_status()
+ resp_json = resp.json()
+ if resp_json:
+ return resp_json[0]
+ else:
+ return None
+
class SandcrawlerMinioClient(object):
def __init__(