diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-13 15:41:23 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-15 16:46:26 -0800 | 
| commit | c59ab02bb9c9e486c98e758f0098be09c1973b42 (patch) | |
| tree | 40d690e8e8a42ac2f8ddec800d1f91b9aa3404e2 /python/fatcat_tools/importers | |
| parent | 31f237a6150f22676e93902a3597461ea954dc2c (diff) | |
| download | fatcat-c59ab02bb9c9e486c98e758f0098be09c1973b42.tar.gz fatcat-c59ab02bb9c9e486c98e758f0098be09c1973b42.zip | |
more ingest importer comments and counts
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/ingest.py | 29 | 
1 files changed, 28 insertions, 1 deletions
| diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 9e75c26f..95df5efb 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -28,13 +28,33 @@ class IngestFileResultImporter(EntityImporter):              print("Requiring GROBID status == 200")          else:              print("NOT checking GROBID success") +        self.project_whitelist = ['fatcat-changelog']      def want(self, row): +        """ +        Logic here probably needs work: + +        - Direct ingests via DOI from fatcat-changelog should probably go +          through regardless of GROBID status +        - We should filter/block things like single-page PDFs here +        - public/anonymous submissions could require successful biblio-glutton +          match, or some other sanity check on the fatcat side (eg, fuzzy title +          match) +        - handle the case of release_stage not being 'published'; if pre-print, +          potentially create a new release. + +        The current logic is intentionally conservative as a first step. +        """          if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200: +            self.counts['skip-grobid'] += 1 +            return False +        if self.project_whitelist and row.get('project') not in self.project_whitelist: +            self.counts['skip-project'] += 1              return False          if row.get('hit') == True and row.get('file_meta'):              return True          else: +            self.counts['skip-hit'] += 1              return False      def parse_record(self, row): @@ -43,7 +63,7 @@ class IngestFileResultImporter(EntityImporter):          fatcat = request.get('fatcat')          file_meta = row['file_meta'] -        # identify release by fatcat ident or extid lookup +        # identify release by fatcat ident, or extid lookup, or biblio-glutton match          release_ident = None          if fatcat and fatcat.get('release_ident'):              release_ident = fatcat.get('release_ident') @@ -63,12 +83,19 @@ class IngestFileResultImporter(EntityImporter):                          continue                  release_ident = release.ident                  break +        if not release and row.get('grobid'): +            # try biblio-glutton extracted hit +            if row['grobid'].get('fatcat_ident'): +                release = row['grobid']['fatcat_ident'].split('_')[-1]          if not release:              self.counts['skip-release-not-found'] += 1 +            return None          cdx = row.get('cdx')          if not cdx: +            # TODO: support archive.org hits? +            self.counts['skip-no-cdx'] += 1              return None          url = make_rel_url(cdx['url'], self.default_link_rel) | 
