From 41fae4c294e2ba43370b4a4193c0f6107201dbf0 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 26 Oct 2021 18:13:38 -0700
Subject: bugfix: setting html_biblio on ingest results

This was caught during lint cleanup
---
 python/sandcrawler/ingest_file.py    | 2 +-
 python/sandcrawler/ingest_fileset.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index b3f2a8e..49c7ddf 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -682,7 +682,7 @@ class IngestFileWorker(SandcrawlerWorker):
                     html_doc = HTMLParser(resource.body)
                     html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
                     if html_biblio:
-                        if not 'html_biblio' in result or html_biblio.title:
+                        if 'html_biblio' not in result and html_biblio.title:
                             result['html_biblio'] = json.loads(
                                 html_biblio.json(exclude_none=True))
                             #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index bf06a39..d88fb46 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -177,7 +177,7 @@ class IngestFilesetWorker(IngestFileWorker):
                     html_doc = HTMLParser(resource.body)
                     html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
                     if html_biblio:
-                        if not 'html_biblio' in result or html_biblio.title:
+                        if 'html_biblio' not in result and html_biblio.title:
                             result['html_biblio'] = json.loads(
                                 html_biblio.json(exclude_none=True))
                             #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr)
--
cgit v1.2.3