diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-13 15:41:23 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-15 16:46:26 -0800 |
commit | c59ab02bb9c9e486c98e758f0098be09c1973b42 (patch) | |
tree | 40d690e8e8a42ac2f8ddec800d1f91b9aa3404e2 /python | |
parent | 31f237a6150f22676e93902a3597461ea954dc2c (diff) | |
download | fatcat-c59ab02bb9c9e486c98e758f0098be09c1973b42.tar.gz fatcat-c59ab02bb9c9e486c98e758f0098be09c1973b42.zip |
more ingest importer comments and counts
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 29 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 2 |
2 files changed, 29 insertions, 2 deletions
```diff
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 9e75c26f..95df5efb 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -28,13 +28,33 @@ class IngestFileResultImporter(EntityImporter):
             print("Requiring GROBID status == 200")
         else:
             print("NOT checking GROBID success")
+        self.project_whitelist = ['fatcat-changelog']
 
     def want(self, row):
+        """
+        Logic here probably needs work:
+
+        - Direct ingests via DOI from fatcat-changelog should probably go
+          through regardless of GROBID status
+        - We should filter/block things like single-page PDFs here
+        - public/anonymous submissions could require successful biblio-glutton
+          match, or some other sanity check on the fatcat side (eg, fuzzy title
+          match)
+        - handle the case of release_stage not being 'published'; if pre-print,
+          potentially create a new release.
+
+        The current logic is intentionally conservative as a first step.
+        """
         if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200:
+            self.counts['skip-grobid'] += 1
+            return False
+        if self.project_whitelist and row.get('project') not in self.project_whitelist:
+            self.counts['skip-project'] += 1
             return False
         if row.get('hit') == True and row.get('file_meta'):
             return True
         else:
+            self.counts['skip-hit'] += 1
             return False
 
     def parse_record(self, row):
@@ -43,7 +63,7 @@ class IngestFileResultImporter(EntityImporter):
         fatcat = request.get('fatcat')
         file_meta = row['file_meta']
 
-        # identify release by fatcat ident or extid lookup
+        # identify release by fatcat ident, or extid lookup, or biblio-glutton match
         release_ident = None
         if fatcat and fatcat.get('release_ident'):
             release_ident = fatcat.get('release_ident')
@@ -63,12 +83,19 @@ class IngestFileResultImporter(EntityImporter):
                     continue
                 release_ident = release.ident
                 break
+        if not release and row.get('grobid'):
+            # try biblio-glutton extracted hit
+            if row['grobid'].get('fatcat_ident'):
+                release = row['grobid']['fatcat_ident'].split('_')[-1]
 
         if not release:
             self.counts['skip-release-not-found'] += 1
+            return None
 
         cdx = row.get('cdx')
         if not cdx:
+            # TODO: support archive.org hits?
+            self.counts['skip-no-cdx'] += 1
             return None
 
         url = make_rel_url(cdx['url'], self.default_link_rel)
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index eee60630..293bc5e6 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -45,7 +45,7 @@ def release_ingest_request(release, oa_only=False, project='fatcat'):
         if v:
             ext_ids[k] = v
 
-    if oa_only and not ext_ids['arxiv'] and not ext_ids['pmcid']:
+    if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'):
         es = release_to_elasticsearch(release)
         if not es['is_oa']:
             return None
```