diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-01 19:11:48 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-01 19:11:48 -0700 |
commit | ba3ee68b7789243921f0063461df1e2f7da65256 (patch) | |
tree | 3f09b98be4bbf3f645b2c806a33714b0ea8b5430 /python | |
parent | cc5f6f72294eee3f4310dcc38ba5b00d5d98fe69 (diff) | |
download | sandcrawler-ba3ee68b7789243921f0063461df1e2f7da65256.tar.gz sandcrawler-ba3ee68b7789243921f0063461df1e2f7da65256.zip |
allow through unknown-scope HTML ingests, for possible SPN import
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 16 |
1 files changed, 5 insertions, 11 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7b7a191..c736878 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -389,16 +389,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 scope=html_scope,
                 platform=html_platform,
             )
-        elif html_scope == 'unknown':
-            html_body.pop("tei_xml", None)
-            return dict(
-                status="unknown-scope",
-                html_biblio=html_biblio_dict,
-                scope=html_scope,
-                platform=html_platform,
-                html_body=html_body,
-            )
-        elif html_scope not in ('article-fulltext',):
+        elif html_scope not in ('article-fulltext','unknown',):
             html_body.pop("tei_xml", None)
             return dict(
                 status="wrong-scope",
@@ -461,13 +452,16 @@ class IngestFileWorker(SandcrawlerWorker):
             partial_result['error_message'] = str(e)[:1600]
             return partial_result
 
-        return dict(
+        info = dict(
             html_body=html_body,
             html_biblio=html_biblio_dict,
             scope=html_scope,
             platform=html_platform,
             html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
         )
+        if html_scope == 'unknown':
+            info['status'] = 'unknown-scope'
+        return info
 
     def timeout_response(self, task: dict) -> dict:
         print("[TIMEOUT]", file=sys.stderr)
```