update ingest proposal source/link naming

author: Bryan Newbold <bnewbold@archive.org> 2019-12-13 17:41:11 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-12-13 17:41:11 -0800
commit: 59776d4175faa3d0b7ff5f25456620b2a84d738e (patch)
tree: 74a5fc79f3a6a8bcb509836a367debb7ec7ebaf6
parent: 86cc7da8b01574587580c7539169cc726d7b4b3d (diff)
download: sandcrawler-59776d4175faa3d0b7ff5f25456620b2a84d738e.tar.gz
sandcrawler-59776d4175faa3d0b7ff5f25456620b2a84d738e.zip
2 files changed, 27 insertions, 17 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index a631811..4e4c8ed 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -82,24 +82,34 @@ NOTE: what about crawl requests where we don't know if we will get a PDF or
 HTML? Or both? Let's just recrawl.
 
 *IngestRequest*
-  - `ingest_type`: required, one of `pdf`, `xml`, `html`, `dataset`
+  - `ingest_type`: required, one of `pdf`, `xml`, `html`, `dataset`. For
+    backwards compatibility, `file` should be interpreted as `pdf`. `pdf` and
+    `xml` return file ingest response; `html` and `dataset` not implemented but
+    would be webcapture (wayback) and fileset (archive.org item or wayback?).
+    In the future: `epub`, `video`, `git`, etc.
   - `base_url`: required, where to start crawl process
-  - `source`: recommended, slug string. indicating the database or "authority" where URL/identifier match is coming from (eg, `unpaywall`, `semantic-scholar`, `save-paper-now`, `doi`)
-  - `source_id`: recommended, slug string. to track where this ingest request is coming from
-  - `actor`: recommended, slug string. tracks the code or user who submitted request
+  - `link_source`: recommended, slug string. indicating the database or "authority"
+    where URL/identifier match is coming from (eg, `doi`, `pubmed`, `unpaywall`
+    (doi), `s2` (semantic-scholar id), `spn` (fatcat release), `core` (CORE
+    id), `mag` (MAG id))
+  - `link_source_id`: recommended, identifier string. pairs with `link_source`.
+  - `ingest_request_source`: recommended, slug string. tracks the service or
+    user who submitted request. eg, `fatcat-changelog`, `editor_<ident>`,
+    `savepapernow-web`
+  - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL
   - `fatcat`
-    - `release_stage`: optional
-    - `release_ident`: optional
-    - `work_ident`: optional
+    - `release_ident`: optional. if provided, indicates that ingest is expected
+      to be fulltext copy of this release (though may be a sibling release
+      under same work if `release_stage` doesn't match)
+    - `work_ident`: optional, unused. might eventually be used if, eg,
+      `release_stage` of ingested file doesn't match that of the `release_ident`
     - `edit_extra`: additional metadata to be included in any eventual fatcat
-      commits. supplements project/source
-  - `ext_ids`
+      commits.
+  - `ext_ids`: matching fatcat schema. used for later lookups. sometimes
+    `link_source` and id are sufficient.
     - `doi`
     - `pmcid`
     - ...
-  - `expect_hash`: optional, if we are expecting a specific file
-    - `sha1`
-    - ...
 
 *FileIngestResult*
   - request (object): the full IngestRequest, copied
@@ -156,9 +166,8 @@ Proposing two tables:
         -- but we use this order for PRIMARY KEY so we have a free index on type/URL
         ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
         base_url                TEXT NOT NULL CHECK (octet_length(url) >= 1),
-        source                  TEXT NOT NULL CHECK (octet_length(source) >= 1),
-        source_id               TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
-        actor                   TEXT NOT NULL CHECK (octet_length(actor) >= 1),
+        link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+        link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
 
         created                 TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
         release_stage           TEXT CHECK (octet_length(release_stage) >= 1),
@@ -167,8 +176,9 @@ Proposing two tables:
         --   ext_ids (source/source_id sometimes enough)
         --   fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
         --   edit_extra
+        -- ingest_request_source   TEXT NOT NULL CHECK (octet_length(ingest_request_source) >= 1),
 
-        PRIMARY KEY (ingest_type, base_url, source, source_id)
+        PRIMARY KEY (ingest_type, base_url, link_source, link_source_id)
     );
 
     CREATE TABLE IF NOT EXISTS ingest_file_result (
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 5a20aac..fcd2e94 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -12,7 +12,7 @@ def run_single_ingest(args):
     request = dict(
         base_url=args.url,
         ext_ids=dict(doi=args.doi),
-        release_id=args.release_id,
+        fatcat=dict(release_ident=args.release_id),
     )
     ingester = IngestFileWorker()
     result = ingester.process(request)
author	Bryan Newbold <bnewbold@archive.org>	2019-12-13 17:41:11 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2019-12-13 17:41:11 -0800
commit	59776d4175faa3d0b7ff5f25456620b2a84d738e (patch)
tree	74a5fc79f3a6a8bcb509836a367debb7ec7ebaf6
parent	86cc7da8b01574587580c7539169cc726d7b4b3d (diff)
download	sandcrawler-59776d4175faa3d0b7ff5f25456620b2a84d738e.tar.gz sandcrawler-59776d4175faa3d0b7ff5f25456620b2a84d738e.zip