diff options
Diffstat (limited to 'python/fatcat_tools/transforms/ingest.py')
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 64 |
1 files changed, 37 insertions, 27 deletions
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 9101a4ec..30b5b190 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -1,4 +1,3 @@ - INGEST_TYPE_CONTAINER_MAP = { # Optica "twtpsm6ytje3nhuqfu3pa7ca7u": "html", @@ -14,7 +13,8 @@ INGEST_TYPE_CONTAINER_MAP = { "lovwr7ladjagzkhmoaszg7efqu": "html", } -def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None): + +def release_ingest_request(release, ingest_request_source="fatcat", ingest_type=None): """ Takes a full release entity object and returns an ingest request (as dict), or None if it seems like this release shouldn't be ingested. @@ -27,27 +27,35 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= calling code should check the returned type field. """ - if release.state != 'active': + if release.state != "active": return None if (not ingest_type) and release.container_id: ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id) if not ingest_type: - if release.release_type == 'stub': + if release.release_type == "stub": return None - elif release.release_type in ['component', 'graphic']: - ingest_type = 'component' - elif release.release_type == 'dataset': - ingest_type = 'dataset' - elif release.release_type == 'software': - ingest_type = 'software' - elif release.release_type == 'post-weblog': - ingest_type = 'html' - elif release.release_type in ['article-journal', 'article', 'chapter', 'paper-conference', 'book', 'report', 'thesis']: - ingest_type = 'pdf' + elif release.release_type in ["component", "graphic"]: + ingest_type = "component" + elif release.release_type == "dataset": + ingest_type = "dataset" + elif release.release_type == "software": + ingest_type = "software" + elif release.release_type == "post-weblog": + ingest_type = "html" + elif release.release_type in [ + "article-journal", + "article", + "chapter", + "paper-conference", + "book", + "report", + "thesis", + ]: + ingest_type = "pdf" else: - ingest_type = 'pdf' + ingest_type = "pdf" # generate a URL where we expect to find fulltext url = None @@ -59,8 +67,10 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= link_source_id = release.ext_ids.arxiv elif release.ext_ids.pmcid and ingest_type == "pdf": # TODO: how to tell if an author manuscript in PMC vs. published? - #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) - url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) + # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) + url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format( + release.ext_ids.pmcid + ) link_source = "pmc" link_source_id = release.ext_ids.pmcid elif release.ext_ids.doi: @@ -75,19 +85,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type= ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) ingest_request = { - 'ingest_type': ingest_type, - 'ingest_request_source': ingest_request_source, - 'base_url': url, - 'release_stage': release.release_stage, - 'fatcat': { - 'release_ident': release.ident, - 'work_ident': release.work_id, + "ingest_type": ingest_type, + "ingest_request_source": ingest_request_source, + "base_url": url, + "release_stage": release.release_stage, + "fatcat": { + "release_ident": release.ident, + "work_ident": release.work_id, }, - 'ext_ids': ext_ids, + "ext_ids": ext_ids, } if link_source and link_source_id: - ingest_request['link_source'] = link_source - ingest_request['link_source_id'] = link_source_id + ingest_request["link_source"] = link_source + ingest_request["link_source_id"] = link_source_id return ingest_request |