diff options
| -rw-r--r-- | python/sandcrawler/ingest.py | 16 | 
1 file changed, 5 insertions, 11 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7b7a191..c736878 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -389,16 +389,7 @@ class IngestFileWorker(SandcrawlerWorker):
                 scope=html_scope,
                 platform=html_platform,
             )
-        elif html_scope == 'unknown':
-            html_body.pop("tei_xml", None)
-            return dict(
-                status="unknown-scope",
-                html_biblio=html_biblio_dict,
-                scope=html_scope,
-                platform=html_platform,
-                html_body=html_body,
-            )
-        elif html_scope not in ('article-fulltext',):
+        elif html_scope not in ('article-fulltext','unknown',):
             html_body.pop("tei_xml", None)
             return dict(
                 status="wrong-scope",
@@ -461,13 +452,16 @@ class IngestFileWorker(SandcrawlerWorker):
             partial_result['error_message'] = str(e)[:1600]
             return partial_result
 
-        return dict(
+        info = dict(
             html_body=html_body,
             html_biblio=html_biblio_dict,
             scope=html_scope,
             platform=html_platform,
             html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
         )
+        if html_scope == 'unknown':
+            info['status'] = 'unknown-scope'
+        return info
 
     def timeout_response(self, task: dict) -> dict:
         print("[TIMEOUT]", file=sys.stderr)
