From ba3ee68b7789243921f0063461df1e2f7da65256 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 1 Oct 2021 19:11:48 -0700 Subject: allow through unknown-scope HTML ingests, for possible SPN import --- python/sandcrawler/ingest.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 7b7a191..c736878 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -389,16 +389,7 @@ class IngestFileWorker(SandcrawlerWorker): scope=html_scope, platform=html_platform, ) - elif html_scope == 'unknown': - html_body.pop("tei_xml", None) - return dict( - status="unknown-scope", - html_biblio=html_biblio_dict, - scope=html_scope, - platform=html_platform, - html_body=html_body, - ) - elif html_scope not in ('article-fulltext',): + elif html_scope not in ('article-fulltext','unknown',): html_body.pop("tei_xml", None) return dict( status="wrong-scope", @@ -461,13 +452,16 @@ class IngestFileWorker(SandcrawlerWorker): partial_result['error_message'] = str(e)[:1600] return partial_result - return dict( + info = dict( html_body=html_body, html_biblio=html_biblio_dict, scope=html_scope, platform=html_platform, html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources], ) + if html_scope == 'unknown': + info['status'] = 'unknown-scope' + return info def timeout_response(self, task: dict) -> dict: print("[TIMEOUT]", file=sys.stderr) -- cgit v1.2.3