about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-01 19:11:48 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-01 19:11:48 -0700
commit ba3ee68b7789243921f0063461df1e2f7da65256 (patch)
tree 3f09b98be4bbf3f645b2c806a33714b0ea8b5430 /python
parent cc5f6f72294eee3f4310dcc38ba5b00d5d98fe69 (diff)
download sandcrawler-ba3ee68b7789243921f0063461df1e2f7da65256.tar.gz
sandcrawler-ba3ee68b7789243921f0063461df1e2f7da65256.zip
allow through unknown-scope HTML ingests, for possible SPN import
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 16
1 files changed, 5 insertions, 11 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7b7a191..c736878 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -389,16 +389,7 @@ class IngestFileWorker(SandcrawlerWorker):
scope=html_scope,
platform=html_platform,
)
- elif html_scope == 'unknown':
- html_body.pop("tei_xml", None)
- return dict(
- status="unknown-scope",
- html_biblio=html_biblio_dict,
- scope=html_scope,
- platform=html_platform,
- html_body=html_body,
- )
- elif html_scope not in ('article-fulltext',):
+ elif html_scope not in ('article-fulltext','unknown',):
html_body.pop("tei_xml", None)
return dict(
status="wrong-scope",
@@ -461,13 +452,16 @@ class IngestFileWorker(SandcrawlerWorker):
partial_result['error_message'] = str(e)[:1600]
return partial_result
- return dict(
+ info = dict(
html_body=html_body,
html_biblio=html_biblio_dict,
scope=html_scope,
platform=html_platform,
html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
)
+ if html_scope == 'unknown':
+ info['status'] = 'unknown-scope'
+ return info
def timeout_response(self, task: dict) -> dict:
print("[TIMEOUT]", file=sys.stderr)