From 832a9e42bc068c1b1656526b4a2cb7108c9b8334 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 18 Feb 2020 16:42:36 -0800 Subject: include rel and oa_status in ingest request 'extra' --- proposals/2019_ingest.md | 4 ++++ python/sandcrawler/db.py | 2 +- python/sandcrawler/persist.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md index 0b569b0..7c73ee3 100644 --- a/proposals/2019_ingest.md +++ b/proposals/2019_ingest.md @@ -97,6 +97,8 @@ HTML? Or both? Let's just recrawl. user who submitted request. eg, `fatcat-changelog`, `editor_<ident>`, `savepapernow-web` - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL + - `rel`: optional. indicates the link type + - `oa_status`: optional. unpaywall schema - `fatcat` - `release_ident`: optional. if provided, indicates that ingest is expected to be fulltext copy of this release (though may be a sibling release @@ -186,6 +188,8 @@ Proposing two tables: -- ext_ids (source/source_id sometimes enough) -- release_ident (if ext_ids and source/source_id not specific enough; eg SPN) -- edit_extra + -- rel + -- oa_status -- ingest_request_source TEXT NOT NULL CHECK (octet_length(ingest_request_source) >= 1), PRIMARY KEY (ingest_type, base_url, link_source, link_source_id) diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index ddb71a0..673912c 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -248,7 +248,7 @@ class SandcrawlerPostgresClient: for r in batch: # in case these fields were already packed into 'request' extra = r.get('request', {}) - for k in ('ext_ids', 'fatcat_release', 'edit_extra'): + for k in ('ext_ids', 'fatcat_release', 'edit_extra', 'rel'): if r.get(k): extra[k] = r[k] if extra: diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index bfd8247..3f2762a 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ 
-110,7 +110,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): request['release_stage'] = raw['release_stage'] if raw.get('fatcat', {}).get('release_ident'): request['request']['release_ident'] = raw['fatcat']['release_ident'] - for k in ('ext_ids', 'edit_extra'): + for k in ('ext_ids', 'edit_extra', 'rel'): if raw.get(k): request['request'][k] = raw[k] # if this dict is empty, trim it to save DB space -- cgit v1.2.3