ingest: fix exception handling in HTML PDF URL extraction

author: Bryan Newbold <bnewbold@archive.org> 2021-05-24 16:26:40 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-05-24 16:26:40 -0700
commit: b5267079739b1155648686b89f32c0ea3e9acbfd (patch)
tree: 4e9b7a2bd0c577701d4994064472e4d4d7c25d6c /python/sandcrawler/html.py
parent: 1263ee33535d232d702324980e7ff69305ed8795 (diff)
download: sandcrawler-b5267079739b1155648686b89f32c0ea3e9acbfd.tar.gz
sandcrawler-b5267079739b1155648686b89f32c0ea3e9acbfd.zip
1 file changed, 2 insertions, 3 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index ca600e4..e3d95bc 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -103,11 +103,10 @@ def extract_fulltext_url(html_url, html_body):
             json_text = json_tag.string
             json_meta = json.loads(json_text)
             pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
-            print(pdf_meta, file=sys.stderr)
             # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
             url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
-        except Exception as e:
-            raise e
+        except (KeyError, TypeError, json.JSONDecodeError):
+            pass
         if url:
             return dict(pdf_url=url, technique="sciencedirect-munge-json")
author	Bryan Newbold <bnewbold@archive.org>	2021-05-24 16:26:40 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-05-24 16:26:40 -0700
commit	b5267079739b1155648686b89f32c0ea3e9acbfd (patch)
tree	4e9b7a2bd0c577701d4994064472e4d4d7c25d6c /python/sandcrawler/html.py
parent	1263ee33535d232d702324980e7ff69305ed8795 (diff)
download	sandcrawler-b5267079739b1155648686b89f32c0ea3e9acbfd.tar.gz sandcrawler-b5267079739b1155648686b89f32c0ea3e9acbfd.zip