diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-12-12 17:49:41 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-12-12 19:49:50 -0800 |
commit | 0827b67a3f195f151a77ca01708e6c98daf778bf (patch) | |
tree | 792fb856174634cad87b55ba74c1f15b9f561bad /python/fatcat_tools/transforms/ingest.py | |
parent | e5d0d98d0377c5833dc4fedb6d8df14f5489edb5 (diff) | |
download | fatcat-0827b67a3f195f151a77ca01708e6c98daf778bf.tar.gz fatcat-0827b67a3f195f151a77ca01708e6c98daf778bf.zip |
tweaks to ingest-file transform
Diffstat (limited to 'python/fatcat_tools/transforms/ingest.py')
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 20 |
1 file changed, 7 insertions, 13 deletions
```diff
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index c2ae6e0f..988f80a2 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,7 +1,7 @@
 
 from .elasticsearch import release_to_elasticsearch
 
-def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat'):
+def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type='pdf'):
     """
     Takes a full release entity object and returns an ingest request (as dict),
     or None if it seems like this release shouldn't be ingested.
@@ -25,25 +25,20 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
 
     # generate a URL where we expect to find fulltext
     url = None
-    expect_mimetypes = []
     if release.ext_ids.arxiv:
         url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv)
-        expect_mimetypes = ['application/pdf']
+    elif release.ext_ids.doi:
+        url = "https://doi.org/{}".format(release.ext_ids.doi)
     elif release.ext_ids.pmcid:
+        # TODO: how to tell if an author manuscript in PMC vs. published?
         #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
         url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
-        expect_mimetypes = ['application/pdf']
-    elif release.ext_ids.doi:
-        url = "https://doi.org/{}".format(release.ext_ids.doi)
 
     if not url:
         return None
 
-    ext_ids = dict()
-    for k in ('doi', 'pmid', 'pmcid', 'arxiv'):
-        v = getattr(release.ext_ids, k)
-        if v:
-            ext_ids[k] = v
+    ext_ids = release.ext_ids.to_dict()
+    ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
 
     if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'):
         es = release_to_elasticsearch(release)
@@ -51,7 +46,7 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
         return None
 
     ingest_request = {
-        'ingest_type': 'file',
+        'ingest_type': ingest_type,
         'ingest_request_source': ingest_request_source,
         'base_url': url,
         'fatcat': {
@@ -60,7 +55,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
             'work_ident': release.work_id,
         },
         'ext_ids': ext_ids,
-        'expect_mimetypes': expect_mimetypes or None,
     }
 
     return ingest_request
```