diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-11-05 17:19:37 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-11-05 17:19:37 -0800 |
commit | 8b6b1447cc37fb76865fd80377c55463e59db3b9 (patch) | |
tree | 42b2f26171db65e10d786f0bc45cccac79ae8b07 /python/fatcat_tools | |
parent | 5afde4690a4653db53fe4962af5da3eb9188d9a2 (diff) | |
download | fatcat-8b6b1447cc37fb76865fd80377c55463e59db3b9.tar.gz fatcat-8b6b1447cc37fb76865fd80377c55463e59db3b9.zip |
ingest: basic checks for ingest_type
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 32 |
1 files changed, 29 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 4b1d3702..c88ec86a 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -20,7 +20,7 @@ class IngestFileResultImporter(EntityImporter): assert self.default_link_rel self.require_grobid = require_grobid if self.require_grobid: - print("Requiring GROBID status == 200") + print("Requiring GROBID status == 200 (for PDFs)") else: print("NOT checking GROBID success") self.ingest_request_source_whitelist = [ @@ -74,8 +74,22 @@ class IngestFileResultImporter(EntityImporter): if not row.get('file_meta'): self.counts['skip-file-meta'] += 1 return False - if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: - self.counts['skip-grobid'] += 1 + + # type-specific filters + if row['request'].get('ingest_type') == 'pdf': + if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: + self.counts['skip-grobid'] += 1 + return False + if row['file_meta'].get('mimetype') not in ("application/pdf",): + self.counts['skip-mimetype'] += 1 + return False + elif row['request'].get('ingest_type') == 'xml': + if row['file_meta'].get('mimetype') not in ("application/xml", + "application/jats+xml", "application/tei+xml", "text/xml"): + self.counts['skip-mimetype'] += 1 + return False + else: + self.counts['skip-ingest-type'] += 1 return False return True @@ -85,6 +99,18 @@ class IngestFileResultImporter(EntityImporter): request = row['request'] fatcat = request.get('fatcat') file_meta = row['file_meta'] + + # double check that want() filtered request correctly (eg, old requests) + if request.get('ingest_type') not in ('pdf', 'xml'): + self.counts['skip-ingest-type'] += 1 + return None + assert (request['ingest_type'], file_meta['mimetype']) in [ + ("pdf", "application/pdf"), + ("xml", "application/xml"), + ("xml", "application/jats+xml"), + ("xml", "application/tei+xml"), + ("xml", "text/xml"), + ] # identify release by fatcat ident, or extid lookup, or biblio-glutton match release_ident = None |