diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 6 |
-rw-r--r-- | python/fatcat_tools/transforms/access.py | 12 |
2 files changed, 13 insertions, 5 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 579dd505..92798a99 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -263,10 +263,12 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None): return f.name -def ftpretr_via_http_proxy(url, proxy_hostport="159.69.240.245:15201", max_retries=10, retry_delay=1): +def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1): """ Fetch file from FTP via external HTTP proxy, e.g. ftp.host.com:/a/b/c would - be retrievable via proxy.com/a/b/c. + be retrievable via proxy.com/a/b/c; (in 09/2021 we used + "159.69.240.245:15201" as proxy_hostport but that started to fail + 2021-10-15; just switch to NIH's http version). """ parsed = urlparse(url) server, path = parsed.netloc, parsed.path diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index 5ed64c7c..39d4c6d3 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -36,10 +36,16 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: """ Extracts access options from a release. 
- TODO: proper implementation + TODO: proper implementation and filtering, instead of just returning first + option found """ options = [] for f in (release.files or []): + thumbnail_url = None + if f.mimetype == 'application/pdf' and f.sha1 and f.urls: + # NOTE: scholar.archive.org does an actual database check before + # generating these URLs, but we skip that for speed + thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{f.sha1[0:2]}/{f.sha1[2:4]}/{f.sha1}.180px.jpg" for u in (f.urls or []): if '://web.archive.org/' in u.url: return [AccessOption( @@ -47,7 +53,7 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: access_url=u.url, mimetype=f.mimetype, size_bytes=f.size, - thumbnail_url=None + thumbnail_url=thumbnail_url, )] elif '://archive.org/' in u.url: return [AccessOption( @@ -55,6 +61,6 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: access_url=u.url, mimetype=f.mimetype, size_bytes=f.size, - thumbnail_url=None + thumbnail_url=thumbnail_url, )] return options |