From c59ab02bb9c9e486c98e758f0098be09c1973b42 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 Nov 2019 15:41:23 -0800 Subject: more ingest importer comments and counts --- python/fatcat_tools/importers/ingest.py | 29 ++++++++++++++++++++++++++++- python/fatcat_tools/transforms/ingest.py | 2 +- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 9e75c26f..95df5efb 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -28,13 +28,33 @@ class IngestFileResultImporter(EntityImporter): print("Requiring GROBID status == 200") else: print("NOT checking GROBID success") + self.project_whitelist = ['fatcat-changelog'] def want(self, row): + """ + Logic here probably needs work: + + - Direct ingests via DOI from fatcat-changelog should probably go + through regardless of GROBID status + - We should filter/block things like single-page PDFs here + - public/anonymous submissions could require successful biblio-glutton + match, or some other sanity check on the fatcat side (eg, fuzzy title + match) + - handle the case of release_stage not being 'published'; if pre-print, + potentially create a new release. + + The current logic is intentionally conservative as a first step. + """ if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200: + self.counts['skip-grobid'] += 1 + return False + if self.project_whitelist and row.get('project') not in self.project_whitelist: + self.counts['skip-project'] += 1 return False if row.get('hit') == True and row.get('file_meta'): return True else: + self.counts['skip-hit'] += 1 return False def parse_record(self, row): @@ -43,7 +63,7 @@ class IngestFileResultImporter(EntityImporter): fatcat = request.get('fatcat') file_meta = row['file_meta'] - # identify release by fatcat ident or extid lookup + # identify release by fatcat ident, or extid lookup, or biblio-glutton match release_ident = None if fatcat and fatcat.get('release_ident'): release_ident = fatcat.get('release_ident') @@ -63,12 +83,19 @@ class IngestFileResultImporter(EntityImporter): continue release_ident = release.ident break + if not release and row.get('grobid'): + # try biblio-glutton extracted hit + if row['grobid'].get('fatcat_ident'): + release = row['grobid']['fatcat_ident'].split('_')[-1] if not release: self.counts['skip-release-not-found'] += 1 + return None cdx = row.get('cdx') if not cdx: + # TODO: support archive.org hits? + self.counts['skip-no-cdx'] += 1 return None url = make_rel_url(cdx['url'], self.default_link_rel) diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index eee60630..293bc5e6 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -45,7 +45,7 @@ def release_ingest_request(release, oa_only=False, project='fatcat'): if v: ext_ids[k] = v - if oa_only and not ext_ids['arxiv'] and not ext_ids['pmcid']: + if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'): es = release_to_elasticsearch(release) if not es['is_oa']: return None -- cgit v1.2.3