diff options
Diffstat (limited to 'python/fatcat_tools/transforms/ingest.py')
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py new file mode 100644 index 00000000..eee60630 --- /dev/null +++ b/python/fatcat_tools/transforms/ingest.py @@ -0,0 +1,66 @@ + +from .elasticsearch import release_to_elasticsearch + +def release_ingest_request(release, oa_only=False, project='fatcat'): + """ + Takes a full release entity object and returns an ingest request (as dict), + or None if it seems like this release shouldn't be ingested. + + The release entity should have the container, file, fileset, and webcapture + fields set. + + The 'oa_only' boolean flag indicates that we should only return an ingest + request if we have reason to believe this is an OA release (or, eg, in + arxiv or pubmed central). Respecting this flag means we are likely to miss + a lot of "hybrid" and "bronze" content, but could reduce load + significantly. + + The type of the ingest request may depend on release type and container + metadata (eg, as to whether we expect a PDF, datasets, web page), so + calling code should check the returned type field. + """ + + if release.state != 'active': + return None + + # generate a URL where we expect to find fulltext + url = None + expect_mimetypes = [] + if release.ext_ids.arxiv: + url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) + expect_mimetypes = ['application/pdf'] + elif release.ext_ids.pmcid: + #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.pmcid) + url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.pmcid) + expect_mimetypes = ['application/pdf'] + elif release.ext_ids.doi: + url = "https://doi.org/{}".format(release.ext_ids.doi) + + if not url: + return None + + ext_ids = dict() + for k in ('doi', 'pmid', 'pmcid', 'arxiv'): + v = getattr(release.ext_ids, k) + if v: + ext_ids[k] = v + + if oa_only and not ext_ids['arxiv'] and not ext_ids['pmcid']: + es = release_to_elasticsearch(release) + if not es['is_oa']: + return None + + ingest_request = { + 'ingest_type': 'file', + 'project': project, + 'base_url': url, + 'fatcat': { + 'release_stage': release.release_stage, + 'release_ident': release.ident, + 'work_ident': release.work_id, + }, + 'ext_ids': ext_ids, + 'expect_mimetypes': expect_mimetypes or None, + } + return ingest_request + |