clarify ingest result schema and semantics

author: Bryan Newbold <bnewbold@archive.org> 2020-01-15 13:54:02 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-15 13:54:02 -0800
commit: d06fd45e3c86cb080ad7724f3fc7575750a9cd69 (patch)
tree: 12862548aa14870af3c710076a6df8441f2ddb4e
parent: 4d0224f3e73315ef4db39643e6d4851e4a466658 (diff)
download: sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.tar.gz
sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.zip
5 files changed, 82 insertions, 30 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index 751532a..0b569b0 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -112,29 +112,40 @@ HTML? Or both? Let's just recrawl.
     - ...
 
 *FileIngestResult*
-  - request (object): the full IngestRequest, copied
-  - terminal
-    - url
-    - status_code
-  - wayback (XXX: ?)
-    - datetime
-    - archive_url
-  - file_meta (same schema as sandcrawler-db table)
-    - size_bytes
-    - md5
-    - sha1
-    - sha256
-    - mimetype
-  - cdx (same schema as sandcrawler-db table)
-  - grobid (same schema as sandcrawler-db table)
-    - status
-    - grobid_version
-    - status_code
-    - xml_url
-    - fatcat_release (via biblio-glutton match)
-    - metadata (JSON)
-  - status (slug): 'success', 'error', etc
-  - hit (boolean): whether we got something that looks like what was requested
+  - `request` (object): the full IngestRequest, copied
+  - `status` (slug): 'success', 'error', etc
+  - `hit` (boolean): whether we got something that looks like what was requested
+  - `terminal` (object): last crawled resource (if any)
+    - `terminal_url` (string; formerly `url`)
+    - `terminal_dt` (string): wayback capture datetime (string)
+    - `terminal_status_code`
+    - `terminal_sha1hex`: should match true `file_meta` SHA1 (not necessarily CDX SHA1)
+      (in case of transport encoding difference)
+  - `file_meta` (object): info about the terminal file
+    - same schema as sandcrawler-db table
+    - `size_bytes`
+    - `md5hex`
+    - `sha1hex`
+    - `sha256hex`
+    - `mimetype`: if not know, `application/octet-stream`
+  - `cdx`: CDX record matching terminal resource. *MAY* be a revisit or partial
+    record (eg, if via SPNv2)
+    - same schema as sandcrawler-db table 
+  - `revisit_cdx` (optional): if `cdx` is a revisit record, this will be the
+    best "original" location for retrieval of the body (matching `flie_meta`)
+    - same schema as sandcrawler-db table 
+  - `grobid`
+    - same schema as sandcrawler-db table
+    - `status` (string)
+    - `status_code` (int)
+    - `grobid_version` (string, from metadata)
+    - `fatcat_release` (string, from metadata)
+    - `metadata` (JSON) (with `grobid_version` and `fatcat_release` removed)
+    - NOT `tei_xml` (strip from reply)
+    - NOT `file_meta` (strip from reply)
+
+In general, it is the `terminal_dt` and `terminal_url` that should be used to
+construct wayback links (eg, for insertion to fatcat), not from the `cdx`.
 
 ## New SQL Tables
 
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index de5e957..53c4ccf 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -126,19 +126,21 @@ class IngestFileWorker(SandcrawlerWorker):
         assert result_row['hit']
         existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
         existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex'])
-        if not (existing_file_meta and existing_grobid):
+        existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt'])
+        if not (existing_file_meta and existing_grobid and existing_cdx):
             raise NotImplementedError("partially-exsiting records not implemented yet")
-        # TODO: CDX
         result = {
             'hit': result_row['hit'],
             'status': "existing",
             'request': request,
             'grobid': existing_grobid,
             'file_meta': existing_file_meta,
+            'cdx': existing_cdx,
             'terminal': {
                 'terminal_url': result_row['terminal_url'],
                 'terminal_dt': result_row['terminal_dt'],
                 'terminal_status_code': result_row['terminal_status_code'],
+                'terminal_sha1hex': result_row['terminal_sha1hex'],
             },
         }
         return result
@@ -174,7 +176,9 @@ class IngestFileWorker(SandcrawlerWorker):
         if result['status'] == "success":
             metadata = self.grobid_client.metadata(result)
             if metadata:
-                result.update(metadata)
+                result['metadata'] = self.grobid_client.metadata(result)
+                result['fatcat_release'] = result['metadata'].pop('fatcat_release', None)
+                result['grobid_version'] = result['metadata'].pop('grobid_version', None)
         result.pop('tei_xml', None)
         result.pop('file_meta', None)
         result.pop('key', None)
@@ -282,6 +286,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 "terminal_url": resource.terminal_url,
                 "terminal_dt": resource.terminal_dt,
                 "terminal_status_code": resource.terminal_status_code,
+                "terminal_sha1hex": file_meta['sha1hex'],
             }
 
         # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
@@ -300,7 +305,9 @@ class IngestFileWorker(SandcrawlerWorker):
 
         if not (resource.hit and file_meta['mimetype'] == "application/pdf"):
             # protocols.io PDFs are "application/octet-stream"
-            if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+            if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url):
+                pass
+            else:
                 result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
                 return result
 
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8f96a26..050e2ea 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -83,10 +83,22 @@ def test_ingest_success(ingest_worker_pdf):
     assert resp['hit'] == True
     assert resp['status'] == "success"
     assert resp['request'] == request
-    assert resp['file_meta']['size_bytes']
-    assert resp['grobid']
+    assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex']
+    assert type(resp['terminal']['terminal_dt']) == str
+    assert resp['terminal']['terminal_url'] == TARGET + "/redirect"
+    assert resp['terminal']['terminal_status_code']
+    assert type(resp['file_meta']['size_bytes']) == int
+    assert resp['file_meta']['mimetype'] == "application/pdf"
+    assert resp['cdx']['url'] == TARGET + "/redirect"
+    assert 'warc_path' not in resp['cdx']
+    assert 'revisit_cdx' not in resp
+    assert resp['grobid']['status'] == "success"
+    assert resp['grobid']['status_code'] == 200
+    assert resp['grobid']['grobid_version']
+    assert 'fatcat_release' in resp['grobid']
+    assert 'grobid_version' not in resp['grobid']['metadata']
+    assert 'fatcat_release' not in resp['grobid']['metadata']
     assert not 'tei_xml' in resp['grobid']
-    assert resp['terminal']
 
 @responses.activate
 def test_ingest_landing(ingest_worker):
@@ -131,5 +143,9 @@ def test_ingest_landing(ingest_worker):
     assert resp['hit'] == False
     assert resp['status'] == "no-pdf-link"
     assert resp['request'] == request
+    assert 'terminal' in resp
+    assert 'file_meta' not in resp
+    assert 'cdx' not in resp
+    assert 'revisit_cdx' not in resp
     assert 'grobid' not in resp
 
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 4f7daef..429c6b0 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -132,6 +132,8 @@ def test_lookup_ftp(wayback_client):
     assert resp.terminal_url == url
     assert resp.terminal_status_code == 226
     assert resp.cdx.url == url
+    assert resp.revisit_cdx
+    assert resp.revisit_cdx.url != url
 
     file_meta = gen_file_metadata(resp.body)
     assert file_meta['sha1hex'] == resp.cdx.sha1hex
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 12ed409..0b2b19c 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -1,10 +1,24 @@
 
+-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
+-- records are implied to be 200 status (or 226 for ftp); either direct hits or
+-- revisits
+-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many
+-- datetimes. import scripts should take efforts to reduce this sort of
+-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
+-- all ingest result url/dt pairs should be included though.
+-- any mimetype is allowed, but presumption should be that actual body is full
+-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each
+-- only a part of work). URLs that are parts of a fileset are allowed.
 CREATE TABLE IF NOT EXISTS cdx (
     url                 TEXT NOT NULL CHECK (octet_length(url) >= 1),
     datetime            TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+    -- sha1hex/cdx_sha1hex difference is intended to help with difference between
+    -- CDX hash (which is transport encoded body) vs. actual body. Probably need to
+    -- include both for all records?
     sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
     cdx_sha1hex         TEXT CHECK (octet_length(cdx_sha1hex) = 40),
     mimetype            TEXT CHECK (octet_length(mimetype) >= 1),
+    -- TODO: enforce that only paths with '/' (item+file) should be included?
     warc_path           TEXT CHECK (octet_length(warc_path) >= 1),
     warc_csize          BIGINT,
     warc_offset         BIGINT,
@@ -12,8 +26,10 @@ CREATE TABLE IF NOT EXISTS cdx (
     PRIMARY KEY(url, datetime)
 );
 CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+-- TODO: remove this index? not currently used
 CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
 
+-- TODO: require all fields. if mimetype unknown, should be octet-stream
 CREATE TABLE IF NOT EXISTS file_meta (
     sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
     sha256hex           TEXT CHECK (octet_length(sha256hex) = 64),
author	Bryan Newbold <bnewbold@archive.org>	2020-01-15 13:54:02 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-15 13:54:02 -0800
commit	d06fd45e3c86cb080ad7724f3fc7575750a9cd69 (patch)
tree	12862548aa14870af3c710076a6df8441f2ddb4e
parent	4d0224f3e73315ef4db39643e6d4851e4a466658 (diff)
download	sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.tar.gz sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.zip