summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/transforms/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/transforms/ingest.py')
-rw-r--r-- python/fatcat_tools/transforms/ingest.py 64
1 files changed, 37 insertions, 27 deletions
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 9101a4ec..30b5b190 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,4 +1,3 @@
-
INGEST_TYPE_CONTAINER_MAP = {
# Optica
"twtpsm6ytje3nhuqfu3pa7ca7u": "html",
@@ -14,7 +13,8 @@ INGEST_TYPE_CONTAINER_MAP = {
"lovwr7ladjagzkhmoaszg7efqu": "html",
}
-def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
+
+def release_ingest_request(release, ingest_request_source="fatcat", ingest_type=None):
"""
Takes a full release entity object and returns an ingest request (as dict),
or None if it seems like this release shouldn't be ingested.
@@ -27,27 +27,35 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
calling code should check the returned type field.
"""
- if release.state != 'active':
+ if release.state != "active":
return None
if (not ingest_type) and release.container_id:
ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id)
if not ingest_type:
- if release.release_type == 'stub':
+ if release.release_type == "stub":
return None
- elif release.release_type in ['component', 'graphic']:
- ingest_type = 'component'
- elif release.release_type == 'dataset':
- ingest_type = 'dataset'
- elif release.release_type == 'software':
- ingest_type = 'software'
- elif release.release_type == 'post-weblog':
- ingest_type = 'html'
- elif release.release_type in ['article-journal', 'article', 'chapter', 'paper-conference', 'book', 'report', 'thesis']:
- ingest_type = 'pdf'
+ elif release.release_type in ["component", "graphic"]:
+ ingest_type = "component"
+ elif release.release_type == "dataset":
+ ingest_type = "dataset"
+ elif release.release_type == "software":
+ ingest_type = "software"
+ elif release.release_type == "post-weblog":
+ ingest_type = "html"
+ elif release.release_type in [
+ "article-journal",
+ "article",
+ "chapter",
+ "paper-conference",
+ "book",
+ "report",
+ "thesis",
+ ]:
+ ingest_type = "pdf"
else:
- ingest_type = 'pdf'
+ ingest_type = "pdf"
# generate a URL where we expect to find fulltext
url = None
@@ -59,8 +67,10 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
link_source_id = release.ext_ids.arxiv
elif release.ext_ids.pmcid and ingest_type == "pdf":
# TODO: how to tell if an author manuscript in PMC vs. published?
- #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
- url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid)
+ # url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid)
+ url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(
+ release.ext_ids.pmcid
+ )
link_source = "pmc"
link_source_id = release.ext_ids.pmcid
elif release.ext_ids.doi:
@@ -75,19 +85,19 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
ingest_request = {
- 'ingest_type': ingest_type,
- 'ingest_request_source': ingest_request_source,
- 'base_url': url,
- 'release_stage': release.release_stage,
- 'fatcat': {
- 'release_ident': release.ident,
- 'work_ident': release.work_id,
+ "ingest_type": ingest_type,
+ "ingest_request_source": ingest_request_source,
+ "base_url": url,
+ "release_stage": release.release_stage,
+ "fatcat": {
+ "release_ident": release.ident,
+ "work_ident": release.work_id,
},
- 'ext_ids': ext_ids,
+ "ext_ids": ext_ids,
}
if link_source and link_source_id:
- ingest_request['link_source'] = link_source
- ingest_request['link_source_id'] = link_source_id
+ ingest_request["link_source"] = link_source
+ ingest_request["link_source_id"] = link_source_id
return ingest_request