From d06fd45e3c86cb080ad7724f3fc7575750a9cd69 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 15 Jan 2020 13:54:02 -0800 Subject: clarify ingest result schema and semantics --- proposals/2019_ingest.md | 57 +++++++++++++++++----------- python/sandcrawler/ingest.py | 15 ++++++-- python/tests/test_ingest.py | 22 +++++++++-- python/tests/test_live_wayback.py | 2 + sql/migrations/2019-12-19-060141_init/up.sql | 16 ++++++++ 5 files changed, 82 insertions(+), 30 deletions(-) diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md index 751532a..0b569b0 100644 --- a/proposals/2019_ingest.md +++ b/proposals/2019_ingest.md @@ -112,29 +112,40 @@ HTML? Or both? Let's just recrawl. - ... *FileIngestResult* - - request (object): the full IngestRequest, copied - - terminal - - url - - status_code - - wayback (XXX: ?) - - datetime - - archive_url - - file_meta (same schema as sandcrawler-db table) - - size_bytes - - md5 - - sha1 - - sha256 - - mimetype - - cdx (same schema as sandcrawler-db table) - - grobid (same schema as sandcrawler-db table) - - status - - grobid_version - - status_code - - xml_url - - fatcat_release (via biblio-glutton match) - - metadata (JSON) - - status (slug): 'success', 'error', etc - - hit (boolean): whether we got something that looks like what was requested + - `request` (object): the full IngestRequest, copied + - `status` (slug): 'success', 'error', etc + - `hit` (boolean): whether we got something that looks like what was requested + - `terminal` (object): last crawled resource (if any) + - `terminal_url` (string; formerly `url`) + - `terminal_dt` (string): wayback capture datetime (string) + - `terminal_status_code` + - `terminal_sha1hex`: should match true `file_meta` SHA1 (not necessarily CDX SHA1) + (in case of transport encoding difference) + - `file_meta` (object): info about the terminal file + - same schema as sandcrawler-db table + - `size_bytes` + - `md5hex` + - `sha1hex` + - `sha256hex` + - `mimetype`: if not know, `application/octet-stream` + - `cdx`: CDX record matching terminal resource. *MAY* be a revisit or partial + record (eg, if via SPNv2) + - same schema as sandcrawler-db table + - `revisit_cdx` (optional): if `cdx` is a revisit record, this will be the + best "original" location for retrieval of the body (matching `flie_meta`) + - same schema as sandcrawler-db table + - `grobid` + - same schema as sandcrawler-db table + - `status` (string) + - `status_code` (int) + - `grobid_version` (string, from metadata) + - `fatcat_release` (string, from metadata) + - `metadata` (JSON) (with `grobid_version` and `fatcat_release` removed) + - NOT `tei_xml` (strip from reply) + - NOT `file_meta` (strip from reply) + +In general, it is the `terminal_dt` and `terminal_url` that should be used to +construct wayback links (eg, for insertion to fatcat), not from the `cdx`. ## New SQL Tables diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index de5e957..53c4ccf 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -126,19 +126,21 @@ class IngestFileWorker(SandcrawlerWorker): assert result_row['hit'] existing_file_meta = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) - if not (existing_file_meta and existing_grobid): + existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], result_row['terminal_dt']) + if not (existing_file_meta and existing_grobid and existing_cdx): raise NotImplementedError("partially-exsiting records not implemented yet") - # TODO: CDX result = { 'hit': result_row['hit'], 'status': "existing", 'request': request, 'grobid': existing_grobid, 'file_meta': existing_file_meta, + 'cdx': existing_cdx, 'terminal': { 'terminal_url': result_row['terminal_url'], 'terminal_dt': result_row['terminal_dt'], 'terminal_status_code': result_row['terminal_status_code'], + 'terminal_sha1hex': result_row['terminal_sha1hex'], }, } return result @@ -174,7 +176,9 @@ class IngestFileWorker(SandcrawlerWorker): if result['status'] == "success": metadata = self.grobid_client.metadata(result) if metadata: - result.update(metadata) + result['metadata'] = self.grobid_client.metadata(result) + result['fatcat_release'] = result['metadata'].pop('fatcat_release', None) + result['grobid_version'] = result['metadata'].pop('grobid_version', None) result.pop('tei_xml', None) result.pop('file_meta', None) result.pop('key', None) @@ -282,6 +286,7 @@ class IngestFileWorker(SandcrawlerWorker): "terminal_url": resource.terminal_url, "terminal_dt": resource.terminal_dt, "terminal_status_code": resource.terminal_status_code, + "terminal_sha1hex": file_meta['sha1hex'], } # fetch must be a hit if we got this far (though not necessarily an ingest hit!) @@ -300,7 +305,9 @@ class IngestFileWorker(SandcrawlerWorker): if not (resource.hit and file_meta['mimetype'] == "application/pdf"): # protocols.io PDFs are "application/octet-stream" - if not (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url): + if (file_meta['mimetype'] == "application/octet-stream" and "://protocols.io/" in resource.terminal_url): + pass + else: result['status'] = "wrong-mimetype" # formerly: "other-mimetype" return result diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 8f96a26..050e2ea 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -83,10 +83,22 @@ def test_ingest_success(ingest_worker_pdf): assert resp['hit'] == True assert resp['status'] == "success" assert resp['request'] == request - assert resp['file_meta']['size_bytes'] - assert resp['grobid'] + assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] + assert type(resp['terminal']['terminal_dt']) == str + assert resp['terminal']['terminal_url'] == TARGET + "/redirect" + assert resp['terminal']['terminal_status_code'] + assert type(resp['file_meta']['size_bytes']) == int + assert resp['file_meta']['mimetype'] == "application/pdf" + assert resp['cdx']['url'] == TARGET + "/redirect" + assert 'warc_path' not in resp['cdx'] + assert 'revisit_cdx' not in resp + assert resp['grobid']['status'] == "success" + assert resp['grobid']['status_code'] == 200 + assert resp['grobid']['grobid_version'] + assert 'fatcat_release' in resp['grobid'] + assert 'grobid_version' not in resp['grobid']['metadata'] + assert 'fatcat_release' not in resp['grobid']['metadata'] assert not 'tei_xml' in resp['grobid'] - assert resp['terminal'] @responses.activate def test_ingest_landing(ingest_worker): @@ -131,5 +143,9 @@ def test_ingest_landing(ingest_worker): assert resp['hit'] == False assert resp['status'] == "no-pdf-link" assert resp['request'] == request + assert 'terminal' in resp + assert 'file_meta' not in resp + assert 'cdx' not in resp + assert 'revisit_cdx' not in resp assert 'grobid' not in resp diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index 4f7daef..429c6b0 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -132,6 +132,8 @@ def test_lookup_ftp(wayback_client): assert resp.terminal_url == url assert resp.terminal_status_code == 226 assert resp.cdx.url == url + assert resp.revisit_cdx + assert resp.revisit_cdx.url != url file_meta = gen_file_metadata(resp.body) assert file_meta['sha1hex'] == resp.cdx.sha1hex diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 12ed409..0b2b19c 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -1,10 +1,24 @@ +-- rows *may* be revisit records; indicated by mimetype == "warc/revisit" +-- records are implied to be 200 status (or 226 for ftp); either direct hits or +-- revisits +-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many +-- datetimes. import scripts should take efforts to reduce this sort of +-- duplication though. one row per *domain*/sha1hex pair is a good guideline. +-- all ingest result url/dt pairs should be included though. +-- any mimetype is allowed, but presumption should be that actual body is full +-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each +-- only a part of work). URLs that are parts of a fileset are allowed. CREATE TABLE IF NOT EXISTS cdx ( url TEXT NOT NULL CHECK (octet_length(url) >= 1), datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), + -- sha1hex/cdx_sha1hex difference is intended to help with difference between + -- CDX hash (which is transport encoded body) vs. actual body. Probably need to + -- include both for all records? sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), mimetype TEXT CHECK (octet_length(mimetype) >= 1), + -- TODO: enforce that only paths with '/' (item+file) should be included? warc_path TEXT CHECK (octet_length(warc_path) >= 1), warc_csize BIGINT, warc_offset BIGINT, @@ -12,8 +26,10 @@ CREATE TABLE IF NOT EXISTS cdx ( PRIMARY KEY(url, datetime) ); CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); +-- TODO: remove this index? not currently used CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); +-- TODO: require all fields. if mimetype unknown, should be octet-stream CREATE TABLE IF NOT EXISTS file_meta ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), sha256hex TEXT CHECK (octet_length(sha256hex) = 64), -- cgit v1.2.3