about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 9
1 files changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e0778d2..363dfb8 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -343,13 +343,14 @@ class IngestFileWorker(SandcrawlerWorker):
html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
assert html_biblio
html_body = html_extract_body_teixml(resource.body)
- html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('tei_xml'))
+ html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))
+ html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
if html_scope not in ('article-fulltext', 'unknown'):
html_body.pop("tei_xml", None)
return dict(
status="html-body-wrong-scope",
- html_biblio=html_biblio,
+ html_biblio=html_biblio_dict,
html_scope=html_scope,
)
@@ -358,7 +359,7 @@ class IngestFileWorker(SandcrawlerWorker):
html_body.pop("tei_xml", None)
return dict(
status="too-many-resources",
- html_biblio=html_biblio,
+ html_biblio=html_biblio_dict,
html_scope=html_scope,
)
@@ -377,7 +378,7 @@ class IngestFileWorker(SandcrawlerWorker):
return dict(
html_body=html_body,
- html_biblio=json.loads(html_biblio.json(exclude_none=True)),
+ html_biblio=html_biblio_dict,
scope=html_scope,
html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
)