diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-12 21:13:16 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-15 16:46:26 -0800 |
commit | 06cf64414d4c7fa497c9ddb83b7c066b3779c4d2 (patch) | |
tree | 202d60c467b6404202c76c6a03c763ecf5c8186c /python/fatcat_tools/transforms | |
parent | 5038de89f5539cfed150b302d672d5c04f380a65 (diff) | |
download | fatcat-06cf64414d4c7fa497c9ddb83b7c066b3779c4d2.tar.gz fatcat-06cf64414d4c7fa497c9ddb83b7c066b3779c4d2.zip |
add ingest request transform (and test)
Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r-- | python/fatcat_tools/transforms/__init__.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 66 |
2 files changed, 67 insertions, 0 deletions
# Patch contents, reconstructed from the collapsed diff.
#
# python/fatcat_tools/transforms/__init__.py (index 735d1b29..6a4b1bba),
# hunk @@ -2,3 +2,4 @@ appends one import after the existing csl import:
#
#     from .ingest import release_ingest_request
#
# python/fatcat_tools/transforms/ingest.py (new file, index 00000000..eee60630):


def release_ingest_request(release, oa_only=False, project='fatcat'):
    """
    Takes a full release entity object and returns an ingest request (as dict),
    or None if it seems like this release shouldn't be ingested.

    The release entity should have the container, file, fileset, and webcapture
    fields set.

    The 'oa_only' boolean flag indicates that we should only return an ingest
    request if we have reason to believe this is an OA release (or, eg, in
    arxiv or pubmed central). Respecting this flag means we are likely to miss
    a lot of "hybrid" and "bronze" content, but could reduce load
    significantly.

    The type of the ingest request may depend on release type and container
    metadata (eg, as to whether we expect a PDF, datasets, web page), so
    calling code should check the returned type field.

    Returns None when:

    - the release state is not 'active'
    - the release has no arxiv, pmcid, or doi identifier to build a URL from
    - 'oa_only' is set, there is no arxiv/pmcid identifier, and the
      elasticsearch transform of the release does not flag it as OA

    Otherwise returns a dict with the keys 'ingest_type', 'project',
    'base_url', 'fatcat', 'ext_ids' and 'expect_mimetypes' (the latter is None
    when no particular mimetype is expected).
    """

    if release.state != 'active':
        return None

    identifiers = release.ext_ids

    # generate a URL where we expect to find fulltext; preference order is
    # arxiv, then PMC, then a generic DOI redirect
    url = None
    expect_mimetypes = []
    if identifiers.arxiv:
        url = "https://arxiv.org/pdf/{}.pdf".format(identifiers.arxiv)
        expect_mimetypes = ['application/pdf']
    elif identifiers.pmcid:
        # Alternative location (not used):
        #   https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/
        # The PMCID is read from ext_ids, consistent with the check above;
        # it is not accessed as an attribute on the release itself.
        url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(identifiers.pmcid)
        expect_mimetypes = ['application/pdf']
    elif identifiers.doi:
        url = "https://doi.org/{}".format(identifiers.doi)

    if not url:
        return None

    # only identifiers that are actually set end up in the request
    ext_ids = {}
    for key in ('doi', 'pmid', 'pmcid', 'arxiv'):
        value = getattr(identifiers, key)
        if value:
            ext_ids[key] = value

    # Unset identifiers are absent from ext_ids (not stored as None), so use
    # .get() here: plain indexing raises KeyError for DOI-only releases.
    if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'):
        # Imported here because only this fallback path needs the
        # elasticsearch transform.
        from .elasticsearch import release_to_elasticsearch
        es = release_to_elasticsearch(release)
        if not es['is_oa']:
            return None

    ingest_request = {
        'ingest_type': 'file',
        'project': project,
        'base_url': url,
        'fatcat': {
            'release_stage': release.release_stage,
            'release_ident': release.ident,
            'work_ident': release.work_id,
        },
        'ext_ids': ext_ids,
        'expect_mimetypes': expect_mimetypes or None,
    }
    return ingest_request