aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/harvest/pubmed.py6
-rw-r--r--python/fatcat_tools/transforms/access.py12
2 files changed, 13 insertions, 5 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py
index 579dd505..92798a99 100644
--- a/python/fatcat_tools/harvest/pubmed.py
+++ b/python/fatcat_tools/harvest/pubmed.py
@@ -263,10 +263,12 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None):
return f.name
-def ftpretr_via_http_proxy(url, proxy_hostport="159.69.240.245:15201", max_retries=10, retry_delay=1):
+def ftpretr_via_http_proxy(url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1):
"""
Fetch file from FTP via external HTTP proxy, e.g. ftp.host.com:/a/b/c would
- be retrievable via proxy.com/a/b/c.
+ be retrievable via proxy.com/a/b/c; (in 09/2021 we used
+ "159.69.240.245:15201" as proxy_hostport but that started to fail
+ 2021-10-15; just switch to NIH's http version).
"""
parsed = urlparse(url)
server, path = parsed.netloc, parsed.path
diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py
index 5ed64c7c..39d4c6d3 100644
--- a/python/fatcat_tools/transforms/access.py
+++ b/python/fatcat_tools/transforms/access.py
@@ -36,10 +36,16 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
"""
Extracts access options from a release.
- TODO: proper implementation
+ TODO: proper implementation and filtering, instead of just returning first
+ option found
"""
options = []
for f in (release.files or []):
+ thumbnail_url = None
+ if f.mimetype == 'application/pdf' and f.sha1 and f.urls:
+ # NOTE: scholar.archive.org does an actual database check before
+ # generating these URLs, but we skip that for speed
+ thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{f.sha1[0:2]}/{f.sha1[2:4]}/{f.sha1}.180px.jpg"
for u in (f.urls or []):
if '://web.archive.org/' in u.url:
return [AccessOption(
@@ -47,7 +53,7 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
access_url=u.url,
mimetype=f.mimetype,
size_bytes=f.size,
- thumbnail_url=None
+ thumbnail_url=thumbnail_url,
)]
elif '://archive.org/' in u.url:
return [AccessOption(
@@ -55,6 +61,6 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
access_url=u.url,
mimetype=f.mimetype,
size_bytes=f.size,
- thumbnail_url=None
+ thumbnail_url=thumbnail_url,
)]
return options