From 9c1fd7cb8e60c397fa6defef2f0dc1eacc8d8aa7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 13 Dec 2019 17:43:27 -0800 Subject: update ingest request schema This is mostly changing ingest_type from 'file' to 'pdf', and adding 'link_source'/'link_source_id', plus some small cleanups. --- python/fatcat_tools/importers/ingest.py | 9 +++++++-- python/fatcat_tools/transforms/ingest.py | 27 ++++++++++++++++++++++----- python/fatcat_tools/workers/changelog.py | 2 +- python/fatcat_web/forms.py | 14 ++++++++------ python/fatcat_web/routes.py | 8 ++++++-- 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index a4258a38..bd5713c3 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -132,10 +132,15 @@ class IngestFileResultImporter(EntityImporter): ) if fatcat and fatcat.get('edit_extra'): fe.edit_extra = fatcat['edit_extra'] + else: + fe.edit_extra = dict() if request.get('ingest_request_source'): - if not fe.edit_extra: - fe.edit_extra = dict() fe.edit_extra['ingest_request_source'] = request['ingest_request_source'] + if request.get('link_source') and request.get('link_source_id'): + fe.edit_extra['link_source'] = request['link_source'] + fe.edit_extra['link_source_id'] = request['link_source_id'] + if not fe.edit_extra: + fe.edit_extra = None return fe def try_update(self, fe): diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 988f80a2..e08d56b8 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -1,7 +1,7 @@ from .elasticsearch import release_to_elasticsearch -def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type='pdf'): +def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type=None): """ Takes a full release entity object and returns an ingest request (as dict), or None if it seems like this release shouldn't be ingested. @@ -12,7 +12,7 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat The 'oa_only' boolean flag indicates that we should only return an ingest request if we have reason to believe this is an OA release (or, eg, in arxiv or pubmed central). Respecting this flag means we are likely to miss - a lot of "hybrid" and "bronze" content, but could reduce load + a lot of "hybrid" and "bronze" content, but could reduce crawl load significantly. The type of the ingest request may depend on release type and container @@ -25,14 +25,22 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat # generate a URL where we expect to find fulltext url = None + link_source = None + link_source_id = None if release.ext_ids.arxiv: url = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) + link_source = "arxiv" + link_source_id = release.ext_ids.arxiv elif release.ext_ids.doi: url = "https://doi.org/{}".format(release.ext_ids.doi) - elif release.ext_ids.pmcid: + link_source = "doi" + link_source_id = release.ext_ids.doi + elif release.ext_ids.pmcid and release.ext_ids.pmid: # TODO: how to tell if an author manuscript in PMC vs. published? #url = "https://www.ncbi.nlm.nih.gov/pmc/articles/{}/pdf/".format(release.ext_ids.pmcid) url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) + link_source = "pubmed" + link_source_id = release.ext_ids.pmid if not url: return None @@ -40,21 +48,30 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat ext_ids = release.ext_ids.to_dict() ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) - if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'): + if oa_only and link_source not in ('arxiv', 'pubmed'): es = release_to_elasticsearch(release) if not es['is_oa']: return None + # TODO: infer ingest type based on release_type or container metadata? + if not ingest_type: + ingest_type = 'pdf' + ingest_request = { 'ingest_type': ingest_type, 'ingest_request_source': ingest_request_source, 'base_url': url, + 'release_stage': release.release_stage, 'fatcat': { - 'release_stage': release.release_stage, 'release_ident': release.ident, 'work_ident': release.work_id, }, 'ext_ids': ext_ids, } + + if link_source and link_source_id: + ingest_request['link_source'] = link_source + ingest_request['link_source_id'] = link_source_id + return ingest_request diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index e1a72217..863ad40a 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -223,7 +223,7 @@ class EntityUpdatesWorker(FatcatWorker): # filter to "new" active releases with no matched files if release.ident in new_release_ids: ir = release_ingest_request(release, ingest_request_source='fatcat-changelog', oa_only=self.ingest_oa_only) - if ir and ir['ingest_type'] == 'file' and not release.files: + if ir and not release.files: producer.produce( self.ingest_file_request_topic, json.dumps(ir).encode('utf-8'), diff --git a/python/fatcat_web/forms.py b/python/fatcat_web/forms.py index bd4e4bbd..5539cc20 100644 --- a/python/fatcat_web/forms.py +++ b/python/fatcat_web/forms.py @@ -386,18 +386,16 @@ class SavePaperNowForm(FlaskForm): choices=release_stage_options, default='') - def to_ingest_request(self, release, actor='savepapernow-web'): + def to_ingest_request(self, release, ingest_request_source='savepapernow'): base_url = self.base_url.data ext_ids = release.ext_ids.to_dict() # by default this dict has a bunch of empty values ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) ingest_request = { 'ingest_type': self.ingest_type.data, - 'ingest_request_source': actor, # TODO: deprecate? - 'actor': actor, + 'ingest_request_source': ingest_request_source, 'base_url': base_url, 'fatcat': { - 'release_stage': release.release_stage, 'release_ident': release.ident, 'work_ident': release.work_id, }, @@ -405,8 +403,12 @@ class SavePaperNowForm(FlaskForm): } if self.release_stage.data: ingest_request['release_stage'] = self.release_stage.data + if release.ext_ids.doi and base_url == "https://doi.org/{}".format(release.ext_ids.doi): - ingest_request['source'] = 'doi' - ingest_request['source_id'] = release.ext_ids.doi + ingest_request['link_source'] = 'doi' + ingest_request['link_source_id'] = release.ext_ids.doi + elif release.ext_ids.arxiv and base_url == "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv): + ingest_request['link_source'] = 'arxiv' + ingest_request['link_source_id'] = release.ext_ids.arxiv return ingest_request diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py index cc0af5cc..8583d255 100644 --- a/python/fatcat_web/routes.py +++ b/python/fatcat_web/routes.py @@ -648,11 +648,11 @@ def release_save(ident): if form.is_submitted(): if form.validate_on_submit(): # got a valid spn request! try to send to kafka-pixy - msg = form.to_ingest_request(release) + msg = form.to_ingest_request(release, ingest_request_source="savepapernow-web") try: kafka_pixy_produce( Config.KAFKA_SAVEPAPERNOW_TOPIC, - json.dumps(msg), + json.dumps(msg, sort_keys=True), ) except Exception as e: print(e, file=sys.stderr) @@ -666,6 +666,10 @@ def release_save(ident): form.release_stage.data = release.release_stage if release.ext_ids.doi: form.base_url.data = "https://doi.org/{}".format(release.ext_ids.doi) + elif release.ext_ids.arxiv: + form.base_url.data = "https://arxiv.org/pdf/{}.pdf".format(release.ext_ids.arxiv) + elif release.ext_ids.pmcid: + form.base_url.data = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(release.ext_ids.pmcid) return render_template('release_save.html', entity=release, form=form), 200 ### Search ################################################################## -- cgit v1.2.3