From c69a8dadb0426fec10fe38474c2f37ceaebdf316 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 4 Apr 2022 17:32:53 -0700 Subject: ingest: drive.google.com ingest support --- python/sandcrawler/html.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 50183be..6e5d2d6 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -343,6 +343,14 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: url = html_url + "pdf" return dict(pdf_url=url, technique="jmir-url") + # Google Drive + # this is assuming it is a PDF + if "drive.google.com/file/d/" in html_url and "/view" in html_url: + gdrive_id = html_url.split('/')[5] + if len(gdrive_id) > 10: + # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24 + return dict(pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}", technique="google-drive") + ### below here we are doing guesses # generic guess: try current URL plus .pdf, if it exists in the HTML body -- cgit v1.2.3