aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-11-04 18:10:00 -0800
committerBryan Newbold <bnewbold@archive.org>2020-11-04 18:10:00 -0800
commitde71aa92d4c7c9d14dfccc0188032d4e7b10090f (patch)
tree45e231fab99d4e5f576323dae8734ae71568c8f7 /python
parent2fdba24da0e0bf3d300cfb959514bf57a3cf6701 (diff)
downloadsandcrawler-de71aa92d4c7c9d14dfccc0188032d4e7b10090f.tar.gz
sandcrawler-de71aa92d4c7c9d14dfccc0188032d4e7b10090f.zip
html: actually publish HTML TEI-XML to body; fix dataflow through ingest a bit
Diffstat (limited to 'python')
-rw-r--r--python/sandcrawler/ingest.py30
-rwxr-xr-xpython/sandcrawler_worker.py6
2 files changed, 31 insertions, 5 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index cc64fa5..e0778d2 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -73,6 +73,7 @@ class IngestFileWorker(SandcrawlerWorker):
self.thumbnail_sink = kwargs.get('thumbnail_sink')
self.pdftext_sink = kwargs.get('pdftext_sink')
self.xmldoc_sink = kwargs.get('xmldoc_sink')
+ self.htmlteixml_sink = kwargs.get('htmlteixml_sink')
self.max_hops = 6
self.try_existing_ingest = kwargs.get('try_existing_ingest', False)
@@ -82,6 +83,7 @@ class IngestFileWorker(SandcrawlerWorker):
self.try_spn2 = kwargs.get('try_spn2', True)
self.html_quick_mode = False
self.adblock_rules = load_adblock_rules()
+ self.max_html_resources = 200
self.base_url_blocklist = [
# robot blocking
@@ -339,13 +341,26 @@ class IngestFileWorker(SandcrawlerWorker):
html_doc = HTMLParser(resource.body)
html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
+ assert html_biblio
html_body = html_extract_body_teixml(resource.body)
html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('tei_xml'))
- assert html_biblio
+ if html_scope not in ('article-fulltext', 'unknown'):
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="html-body-wrong-scope",
+ html_biblio=html_biblio,
+ html_scope=html_scope,
+ )
raw_resources = html_extract_resources(resource.terminal_url, html_doc, self.adblock_rules)
- assert len(raw_resources) <= 200
+ if len(raw_resources) > self.max_html_resources:
+ html_body.pop("tei_xml", None)
+ return dict(
+ status="too-many-resources",
+ html_biblio=html_biblio,
+ html_scope=html_scope,
+ )
when = parse_cdx_datetime(resource.cdx.datetime)
@@ -355,6 +370,11 @@ class IngestFileWorker(SandcrawlerWorker):
else:
full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
+ if self.htmlteixml_sink and html_body['status'] == "success":
+ self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex'])
+
+ html_body.pop("tei_xml", None)
+
return dict(
html_body=html_body,
html_biblio=json.loads(html_biblio.json(exclude_none=True)),
@@ -587,9 +607,9 @@ class IngestFileWorker(SandcrawlerWorker):
info = self.process_hit(ingest_type, resource, file_meta)
result.update(info)
- # scope is getting calculated in process_hit()
- if result.get('scope') and result['scope'] not in ('article-fulltext', 'unknown'):
- result['status'] = "wrong-scope"
+ # check if processing turned up an error
+ if info.get('status') not in ('success', None):
+ result['status'] = info['status']
return result
result['status'] = "success"
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 3681d7f..6be8bac 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -212,6 +212,7 @@ def run_ingest_file(args):
pdftext_topic = "sandcrawler-{}.pdf-text".format(args.env)
thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env)
xmldoc_topic = "sandcrawler-{}.xml-doc".format(args.env)
+ htmlteixml_topic = "sandcrawler-{}.html-teixml".format(args.env)
sink = KafkaSink(
kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic,
@@ -235,6 +236,10 @@ def run_ingest_file(args):
kafka_hosts=args.kafka_hosts,
produce_topic=xmldoc_topic,
)
+ htmlteixml_sink = KafkaSink(
+ kafka_hosts=args.kafka_hosts,
+ produce_topic=htmlteixml_topic,
+ )
worker = IngestFileWorker(
grobid_client=grobid_client,
sink=sink,
@@ -242,6 +247,7 @@ def run_ingest_file(args):
thumbnail_sink=thumbnail_sink,
pdftext_sink=pdftext_sink,
xmldoc_sink=xmldoc_sink,
+ htmlteixml_sink=htmlteixml_sink,
# don't SPNv2 for --bulk backfill
try_spn2=not args.bulk,
)