diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 18:17:09 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-06 18:17:09 -0800 |
commit | 8f4a22d78acb6518c6546645557ad5f0d2253c66 (patch) | |
tree | 4f8a917fc3bf5ccb4a511a303a309374d9c72ea6 /python/sandcrawler/ingest.py | |
parent | 583f11aa95b3af5897d29f143f99716a257e9357 (diff) | |
download | sandcrawler-8f4a22d78acb6518c6546645557ad5f0d2253c66.tar.gz sandcrawler-8f4a22d78acb6518c6546645557ad5f0d2253c66.zip |
html: refactors/tweaks from testing
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- | python/sandcrawler/ingest.py | 9 |
1 files changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e0778d2..363dfb8 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -343,13 +343,14 @@ class IngestFileWorker(SandcrawlerWorker):
         html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
         assert html_biblio
         html_body = html_extract_body_teixml(resource.body)
-        html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('tei_xml'))
+        html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count'))
+        html_biblio_dict = json.loads(html_biblio.json(exclude_none=True))
 
         if html_scope not in ('article-fulltext', 'unknown'):
             html_body.pop("tei_xml", None)
             return dict(
                 status="html-body-wrong-scope",
-                html_biblio=html_biblio,
+                html_biblio=html_biblio_dict,
                 html_scope=html_scope,
             )
 
@@ -358,7 +359,7 @@ class IngestFileWorker(SandcrawlerWorker):
             html_body.pop("tei_xml", None)
             return dict(
                 status="too-many-resources",
-                html_biblio=html_biblio,
+                html_biblio=html_biblio_dict,
                 html_scope=html_scope,
             )
 
@@ -377,7 +378,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         return dict(
             html_body=html_body,
-            html_biblio=json.loads(html_biblio.json(exclude_none=True)),
+            html_biblio=html_biblio_dict,
             scope=html_scope,
             html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources],
         )