about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-12-12 17:49:41 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2019-12-12 19:49:50 -0800
commit 0827b67a3f195f151a77ca01708e6c98daf778bf (patch)
tree 792fb856174634cad87b55ba74c1f15b9f561bad
parent e5d0d98d0377c5833dc4fedb6d8df14f5489edb5 (diff)
download fatcat-0827b67a3f195f151a77ca01708e6c98daf778bf.tar.gz
fatcat-0827b67a3f195f151a77ca01708e6c98daf778bf.zip
tweaks to ingest-file transform
-rw-r--r-- python/fatcat_tools/transforms/ingest.py 20
1 file changed, 7 insertions, 13 deletions
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index c2ae6e0f..988f80a2 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,7 +1,7 @@
from .elasticsearch import release_to_elasticsearch
-def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat'):
+def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type='pdf'):
"""
Takes a full release entity object and returns an ingest request (as dict),
or None if it seems like this release shouldn't be ingested.
@@ -25,25 +25,20 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
# generate a URL where we expect to find fulltext
url = None
- expect_mimetypes = []
if release.ext_ids.arxiv:
url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv)
- expect_mimetypes = ['application/pdf']
+ elif release.ext_ids.doi:
+ url = "https://doi.org/{}".format(release.ext_ids.doi)
elif release.ext_ids.pmcid:
+ # TODO: how to tell if an author manuscript in PMC vs. published?
#url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
- expect_mimetypes = ['application/pdf']
- elif release.ext_ids.doi:
- url = "https://doi.org/{}".format(release.ext_ids.doi)
if not url:
return None
- ext_ids = dict()
- for k in ('doi', 'pmid', 'pmcid', 'arxiv'):
- v = getattr(release.ext_ids, k)
- if v:
- ext_ids[k] = v
+ ext_ids = release.ext_ids.to_dict()
+ ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'):
es = release_to_elasticsearch(release)
@@ -51,7 +46,7 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
return None
ingest_request = {
- 'ingest_type': 'file',
+ 'ingest_type': ingest_type,
'ingest_request_source': ingest_request_source,
'base_url': url,
'fatcat': {
@@ -60,7 +55,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
'work_ident': release.work_id,
},
'ext_ids': ext_ids,
- 'expect_mimetypes': expect_mimetypes or None,
}
return ingest_request