summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/fatcat_tools/importers/ingest.py | 29
-rw-r--r-- python/fatcat_tools/transforms/ingest.py | 2
2 files changed, 29 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 9e75c26f..95df5efb 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -28,13 +28,33 @@ class IngestFileResultImporter(EntityImporter):
print("Requiring GROBID status == 200")
else:
print("NOT checking GROBID success")
+ self.project_whitelist = ['fatcat-changelog']
def want(self, row):
+ """
+ Logic here probably needs work:
+
+ - Direct ingests via DOI from fatcat-changelog should probably go
+ through regardless of GROBID status
+ - We should filter/block things like single-page PDFs here
+ - public/anonymous submissions could require successful biblio-glutton
+ match, or some other sanity check on the fatcat side (eg, fuzzy title
+ match)
+ - handle the case of release_stage not being 'published'; if pre-print,
+ potentially create a new release.
+
+ The current logic is intentionally conservative as a first step.
+ """
if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200:
+ self.counts['skip-grobid'] += 1
+ return False
+ if self.project_whitelist and row.get('project') not in self.project_whitelist:
+ self.counts['skip-project'] += 1
return False
if row.get('hit') == True and row.get('file_meta'):
return True
else:
+ self.counts['skip-hit'] += 1
return False
def parse_record(self, row):
@@ -43,7 +63,7 @@ class IngestFileResultImporter(EntityImporter):
fatcat = request.get('fatcat')
file_meta = row['file_meta']
- # identify release by fatcat ident or extid lookup
+ # identify release by fatcat ident, or extid lookup, or biblio-glutton match
release_ident = None
if fatcat and fatcat.get('release_ident'):
release_ident = fatcat.get('release_ident')
@@ -63,12 +83,19 @@ class IngestFileResultImporter(EntityImporter):
continue
release_ident = release.ident
break
+ if not release and row.get('grobid'):
+ # try biblio-glutton extracted hit
+ if row['grobid'].get('fatcat_ident'):
+ release = row['grobid']['fatcat_ident'].split('_')[-1]
if not release:
self.counts['skip-release-not-found'] += 1
+ return None
cdx = row.get('cdx')
if not cdx:
+ # TODO: support archive.org hits?
+ self.counts['skip-no-cdx'] += 1
return None
url = make_rel_url(cdx['url'], self.default_link_rel)
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index eee60630..293bc5e6 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -45,7 +45,7 @@ def release_ingest_request(release, oa_only=False, project='fatcat'):
if v:
ext_ids[k] = v
- if oa_only and not ext_ids['arxiv'] and not ext_ids['pmcid']:
+ if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'):
es = release_to_elasticsearch(release)
if not es['is_oa']:
return None