filesets: iteration of implementation and docs

author: Bryan Newbold <bnewbold@archive.org> 2021-10-15 13:17:23 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:29 -0700
commit: 6cccac03451f46cb59897871e6631debca558771 (patch)
tree: e846a634d5a69584aab1902455dab1cf59f7b28c /proposals
parent: 84179e60f747070f7a2424e4deccaee2eb096605 (diff)
download: sandcrawler-6cccac03451f46cb59897871e6631debca558771.tar.gz
sandcrawler-6cccac03451f46cb59897871e6631debca558771.zip
1 files changed, 19 insertions, 14 deletions
diff --git a/proposals/2021-09-09_fileset_ingest.md b/proposals/2021-09-09_fileset_ingest.md
index bb9d358..82da7d7 100644
--- a/proposals/2021-09-09_fileset_ingest.md
+++ b/proposals/2021-09-09_fileset_ingest.md
@@ -121,14 +121,9 @@ New python types:
         ingest_strategy: str
         status: str
         manifest: List[FilesetManifestFile]
-
-    FilesetIngestResult
-        ingest_strategy: str
-        status: str
-        manifest: List[FilesetManifestFile]
-        single_file_meta: Optional[dict]
-        single_terminal: Optional[dict]
-        single_cdx: Optional[dict]
+        file_file_meta: Optional[dict]
+        file_terminal: Optional[dict]
+        file_cdx: Optional[dict]
         bundle_file_meta: Optional[dict]
         bundle_terminal: Optional[dict]
         bundle_cdx: Optional[dict]
@@ -160,6 +155,9 @@ New python APIs/classes:
   valid platform, which could be found via API or parsing, but has the wrong
   scope. Eg, tried to fetch a dataset, but got a DOI which represents all
   versions of the dataset, not a specific version.
+- `platform-restricted`/`PlatformRestrictedError`: for, e.g., embargoes
+- `platform-404`: got to a landing page, and it seemed to be in-scope, but no
+  platform record was found anyway
 
 
 ## New Sandcrawler Code and Worker
@@ -216,11 +214,14 @@ Additional fileset-specific fields:
     platform_id: str
     ingest_strategy: str
     archiveorg_item_name: str (optional, only for `archiveorg-*` strategies)
+    file_count: int
+    total_size: int
     fileset_bundle (optional, only for `*-fileset-bundle` strategy)
-        archiveorg_bundle_path
         file_meta
         cdx
+        revisit_cdx
         terminal
+        archiveorg_bundle_path
     fileset_file (optional, only for `*-file` strategy)
         file_meta
         terminal
@@ -247,6 +248,9 @@ condition.
 
 ## New SQL Tables
 
+Note that this table *complements* `ingest_file_result`; it doesn't replace it.
+`ingest_file_result` could more accurately be called `ingest_result`.
+
     CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
         ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
         base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
@@ -254,9 +258,9 @@ condition.
         hit                     BOOLEAN NOT NULL,
         status                  TEXT CHECK (octet_length(status) >= 1),
 
-        platform_name           TEXT CHECK (octet_length(platform) >= 1),
-        platform_domain         TEXT CHECK (octet_length(platform_domain) >= 1),
-        platform_id             TEXT CHECK (octet_length(platform_id) >= 1),
+        platform_name           TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
+        platform_domain         TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
+        platform_id             TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
         ingest_strategy         TEXT CHECK (octet_length(ingest_strategy) >= 1),
         total_size              BIGINT,
         file_count              INT,
@@ -282,9 +286,10 @@ condition.
 
         PRIMARY KEY (ingest_type, base_url)
     );
-    CREATE INDEX ingest_fileset_result_terminal_url_idx ON ingest_fileset_result(terminal_url);
-    # TODO: index on (platform_name,platform_domain,platform_id) ?
+    CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
 
+Persist worker should only insert into this table if `platform_name`,
+`platform_domain`, and `platform_id` are extracted successfully.
 
 ## New Kafka Topic
author	Bryan Newbold <bnewbold@archive.org>	2021-10-15 13:17:23 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-15 18:15:29 -0700
commit	6cccac03451f46cb59897871e6631debca558771 (patch)
tree	e846a634d5a69584aab1902455dab1cf59f7b28c /proposals
parent	84179e60f747070f7a2424e4deccaee2eb096605 (diff)
download	sandcrawler-6cccac03451f46cb59897871e6631debca558771.tar.gz sandcrawler-6cccac03451f46cb59897871e6631debca558771.zip