diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-20 20:41:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-20 20:41:32 -0700 |
commit | 966df43c77581770df4d83d37afe8ead41d51abb (patch) | |
tree | a03e53fb9058a4094fcfbb8f2d97ba69e1a1df87 /python/sandcrawler/html.py | |
parent | 98b95dea4eafec78f16f6afbabfe65aa2489e78f (diff) | |
download | sandcrawler-966df43c77581770df4d83d37afe8ead41d51abb.tar.gz sandcrawler-966df43c77581770df4d83d37afe8ead41d51abb.zip |
ingest: more PDF fulltext tricks
Diffstat (limited to 'python/sandcrawler/html.py')
-rw-r--r-- | python/sandcrawler/html.py | 7 |
1 file changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 73c808c..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -323,6 +323,13 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
                 technique="google-drive",
             )
 
+    # https://doi.org/10.24850/j-tyca-14-4-7
+    # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+    if "docs.google.com/viewer?url=" in html_url:
+        original_url = html_url.split("?url=")[1]
+        if original_url:
+            return dict(pdf_url=original_url, technique="docs.google.com viewer")
+
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body