diff options
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/ingest.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/xml.py | 7 |
2 files changed, 10 insertions, 1 deletion
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 1a42b6a..363485e 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -18,6 +18,7 @@ from sandcrawler.html import extract_fulltext_url
 from sandcrawler.html_metadata import html_extract_fulltext_url, XML_FULLTEXT_PATTERNS
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.xml import xml_reserialize
 
 
 class IngestFileWorker(SandcrawlerWorker):
@@ -316,10 +317,11 @@ class IngestFileWorker(SandcrawlerWorker):
         count), or attempting to fetch sub-resources.
         """
         if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml":
+            jats_xml = xml_reserialize(resource.body)
             msg = dict(
                 sha1hex=file_meta["sha1hex"],
                 status="success",
-                jats_xml=resource.body.encode('utf-8'),
+                jats_xml=jats_xml,
             )
             self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex'])
             return dict(status="success")
diff --git a/python/sandcrawler/xml.py b/python/sandcrawler/xml.py
new file mode 100644
index 0000000..7a0086d
--- /dev/null
+++ b/python/sandcrawler/xml.py
@@ -0,0 +1,7 @@
+
+import xml.etree.ElementTree as ET
+
+
+def xml_reserialize(raw: bytes) -> str:
+    root = ET.fromstring(raw)
+    return '<?xml version="1.0" encoding="UTF-8"?>\n' + ET.tostring(root, encoding="unicode")