diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 19:51:00 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-18 11:58:26 -0800 |
commit | 4979c58ee91903148962f4d62d1a8d423349ad67 (patch) | |
tree | 09901de13601d058eb413614a0dc626e1e30f4d2 /fatcat_scholar/sandcrawler.py | |
parent | 7d38f46fc1970decfcfb1e3f4583b85605e5b8ee (diff) | |
download | fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.tar.gz fatcat-scholar-4979c58ee91903148962f4d62d1a8d423349ad67.zip |
add basic html fulltext support to fetch pipeline
Diffstat (limited to 'fatcat_scholar/sandcrawler.py')
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 11 |
1 files changed, 11 insertions, 0 deletions
```diff
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 25c7002..416ed83 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -27,6 +27,17 @@ class SandcrawlerPostgrestClient:
         else:
             return None
 
+    def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
+        resp = requests.get(
+            self.api_url + "/html_meta", params=dict(sha1hex="eq." + sha1)
+        )
+        resp.raise_for_status()
+        resp_json = resp.json()
+        if resp_json:
+            return resp_json[0]
+        else:
+            return None
+
 
 class SandcrawlerMinioClient(object):
     def __init__(
```