From 8b6b1447cc37fb76865fd80377c55463e59db3b9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 5 Nov 2020 17:19:37 -0800
Subject: ingest: basic checks for ingest_type

---
 python/fatcat_tools/importers/ingest.py | 32 +++++++++++++++++++++++++++++---
 python/tests/files/example_ingest.json  |  2 +-
 python/tests/import_ingest.py           |  6 ++++++
 3 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 4b1d3702..c88ec86a 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -20,7 +20,7 @@ class IngestFileResultImporter(EntityImporter):
         assert self.default_link_rel
         self.require_grobid = require_grobid
         if self.require_grobid:
-            print("Requiring GROBID status == 200")
+            print("Requiring GROBID status == 200 (for PDFs)")
         else:
             print("NOT checking GROBID success")
         self.ingest_request_source_whitelist = [
@@ -74,8 +74,22 @@ class IngestFileResultImporter(EntityImporter):
         if not row.get('file_meta'):
             self.counts['skip-file-meta'] += 1
             return False
-        if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
-            self.counts['skip-grobid'] += 1
+
+        # type-specific filters: check mimetype (and GROBID status for PDFs); skip unknown ingest types
+        if row['request'].get('ingest_type') == 'pdf':
+            if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
+                self.counts['skip-grobid'] += 1
+                return False
+            if row['file_meta'].get('mimetype') not in ("application/pdf",):
+                self.counts['skip-mimetype'] += 1
+                return False
+        elif row['request'].get('ingest_type') == 'xml':
+            if row['file_meta'].get('mimetype') not in ("application/xml",
+                    "application/jats+xml", "application/tei+xml", "text/xml"):
+                self.counts['skip-mimetype'] += 1
+                return False
+        else:
+            self.counts['skip-ingest-type'] += 1
             return False
 
         return True
@@ -85,6 +99,18 @@ class IngestFileResultImporter(EntityImporter):
         request = row['request']
         fatcat = request.get('fatcat')
         file_meta = row['file_meta']
+
+        # double-check that want() filtered the request correctly (e.g., old requests with no ingest_type)
+        if request.get('ingest_type') not in ('pdf', 'xml'):
+            self.counts['skip-ingest-type'] += 1
+            return None
+        assert (request['ingest_type'], file_meta['mimetype']) in [
+            ("pdf", "application/pdf"),
+            ("xml", "application/xml"),
+            ("xml", "application/jats+xml"),
+            ("xml", "application/tei+xml"),
+            ("xml", "text/xml"),
+        ]
 
         # identify release by fatcat ident, or extid lookup, or biblio-glutton match
         release_ident = None
diff --git a/python/tests/files/example_ingest.json b/python/tests/files/example_ingest.json
index cea67fa7..a9791587 100644
--- a/python/tests/files/example_ingest.json
+++ b/python/tests/files/example_ingest.json
@@ -1,2 +1,2 @@
-{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, "status": "success"}
+{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_type": "pdf", "ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, "status": "success"}
 {"request":{"ingest_type":"pdf","ingest_request_source":"fatcat-changelog","base_url":"https://doi.org/10.3917/popav.748.0017","release_stage":"published","fatcat":{"release_ident":"weeqjkvsx5abze2bhithyrx6wu","work_ident":"ujatsk25yrdw5gofubw7nogzgq"},"ext_ids":{"doi":"10.3917/popav.748.0017"},"link_source":"doi","link_source_id":"10.3917/popav.748.0017"},"hit":false,"hops":["https://doi.org/10.3917/popav.748.0017"],"status":"wayback-error","error_message":"replay fetch didn't return X-Archive-Src in headers"}
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index 4a46232a..05287af4 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -94,6 +94,12 @@ def test_ingest_dict_parse(ingest_importer):
 def test_ingest_dict_parse_old(ingest_importer):
     with open('tests/files/example_ingest.old.json', 'r') as f:
         raw = json.loads(f.readline())
+
+        # ancient ingest requests had no ingest_type; parse_record() returns None for them
+        f = ingest_importer.parse_record(raw)
+        assert f == None
+        raw['request']['ingest_type'] = 'pdf'
+
         f = ingest_importer.parse_record(raw)
         assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313"
         assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff"
-- 
cgit v1.2.3