From c59ab02bb9c9e486c98e758f0098be09c1973b42 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Wed, 13 Nov 2019 15:41:23 -0800
Subject: more ingest importer comments and counts

---
 python/fatcat_tools/importers/ingest.py  | 29 ++++++++++++++++++++++++++++-
 python/fatcat_tools/transforms/ingest.py |  2 +-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 9e75c26f..95df5efb 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -28,13 +28,33 @@ class IngestFileResultImporter(EntityImporter):
             print("Requiring GROBID status == 200")
         else:
             print("NOT checking GROBID success")
+        self.project_whitelist = ['fatcat-changelog']
 
     def want(self, row):
+        """
+        Logic here probably needs work:
+
+        - Direct ingests via DOI from fatcat-changelog should probably go
+          through regardless of GROBID status
+        - We should filter/block things like single-page PDFs here
+        - public/anonymous submissions could require successful biblio-glutton
+          match, or some other sanity check on the fatcat side (eg, fuzzy title
+          match)
+        - handle the case of release_stage not being 'published'; if pre-print,
+          potentially create a new release.
+
+        The current logic is intentionally conservative as a first step.
+        """
         if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200:
+            self.counts['skip-grobid'] += 1
+            return False
+        if self.project_whitelist and row.get('project') not in self.project_whitelist:
+            self.counts['skip-project'] += 1
             return False
         if row.get('hit') == True and row.get('file_meta'):
             return True
         else:
+            self.counts['skip-hit'] += 1
             return False
 
     def parse_record(self, row):
@@ -43,7 +63,7 @@ class IngestFileResultImporter(EntityImporter):
         fatcat = request.get('fatcat')
         file_meta = row['file_meta']
 
-        # identify release by fatcat ident or extid lookup
+        # identify release by fatcat ident, or extid lookup, or biblio-glutton match
         release_ident = None
         if fatcat and fatcat.get('release_ident'):
             release_ident = fatcat.get('release_ident')
@@ -63,12 +83,19 @@ class IngestFileResultImporter(EntityImporter):
                         continue
                 release_ident = release.ident
                 break
+        if not release and row.get('grobid'):
+            # try biblio-glutton extracted hit
+            if row['grobid'].get('fatcat_ident'):
+                release = row['grobid']['fatcat_ident'].split('_')[-1]
 
         if not release:
             self.counts['skip-release-not-found'] += 1
+            return None
 
         cdx = row.get('cdx')
         if not cdx:
+            # TODO: support archive.org hits?
+            self.counts['skip-no-cdx'] += 1
             return None
 
         url = make_rel_url(cdx['url'], self.default_link_rel)
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index eee60630..293bc5e6 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -45,7 +45,7 @@ def release_ingest_request(release, oa_only=False, project='fatcat'):
         if v:
             ext_ids[k] = v
 
-    if oa_only and not ext_ids['arxiv'] and not ext_ids['pmcid']:
+    if oa_only and not ext_ids.get('arxiv') and not ext_ids.get('pmcid'):
         es = release_to_elasticsearch(release)
         if not es['is_oa']:
             return None
-- 
cgit v1.2.3