summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-12-02 10:49:35 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 23:03:08 -0800
commit 4e332e9037530ebc62836acfa78896dc76700c9c (patch)
tree f4a7505dc233681f18220581b52b3b97064f3a00
parent c29183a78cb13d507b44bc57d392beae89250339 (diff)
download fatcat-4e332e9037530ebc62836acfa78896dc76700c9c.tar.gz
fatcat-4e332e9037530ebc62836acfa78896dc76700c9c.zip
add 'lxml' mode for large XML file import, and multi-tags
-rwxr-xr-x python/fatcat_import.py 3
-rw-r--r-- python/fatcat_tools/importers/common.py 43
-rw-r--r-- python/tests/import_pubmed.py 4
3 files changed, 31 insertions, 19 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index ff6c94dc..6c9e65a8 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -71,8 +71,7 @@ def run_pubmed(args):
Bs4XmlLargeFilePusher(
pi,
args.xml_file,
- "PubmedArticle",
- record_list_tag="PubmedArticleSet",
+ ["PubmedArticle"],
).run()
def run_jstor(args):
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 6dc2ab9e..2446cdbf 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -8,6 +8,9 @@ import datetime
import subprocess
from collections import Counter
from typing import Optional, Tuple
+from confluent_kafka import Consumer, KafkaException
+import lxml
+import xml.parsers.expat
import xml.etree.ElementTree as ET
import elasticsearch
@@ -528,7 +531,7 @@ class JsonLinePusher(RecordPusher):
record = json.loads(line)
self.importer.push_record(record)
counts = self.importer.finish()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
@@ -544,7 +547,7 @@ class CsvPusher(RecordPusher):
continue
self.importer.push_record(line)
counts = self.importer.finish()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
@@ -560,7 +563,7 @@ class LinePusher(RecordPusher):
continue
self.importer.push_record(line)
counts = self.importer.finish()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
@@ -579,7 +582,7 @@ class SqlitePusher(RecordPusher):
for row in cur:
self.importer.push_record(row)
counts = self.importer.finish()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
@@ -600,7 +603,7 @@ class Bs4XmlLinesPusher(RecordPusher):
self.importer.push_record(soup)
soup.decompose()
counts = self.importer.finish()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
@@ -618,7 +621,7 @@ class Bs4XmlFilePusher(RecordPusher):
record.decompose()
counts = self.importer.finish()
soup.decompose()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
@@ -641,29 +644,39 @@ class Bs4XmlLargeFilePusher(RecordPusher):
by inner container/release API lookup caches.
"""
- def __init__(self, importer, xml_file, record_tag, **kwargs):
+ def __init__(self, importer, xml_file, record_tags, use_lxml=False, **kwargs):  # extra **kwargs are accepted and not stored here
self.importer = importer
self.xml_file = xml_file
- self.record_tag = record_tag
+ self.record_tags = record_tags  # collection of element tag names; run() tests `element.tag in self.record_tags`
+ self.use_lxml = use_lxml  # when true, run() parses with lxml.etree.iterparse instead of ET.iterparse
def run(self):
- elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
+ if self.use_lxml:
+ elem_iter = lxml.etree.iterparse(self.xml_file, ["start", "end"], load_dtd=True)
+ else:
+ elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
root = None
for (event, element) in elem_iter:
- if not root and event == "start":
+ if (root is None) and event == "start":  # first "start" event is the document root; keep it so it can be cleared after each record
root = element
continue
- if not (element.tag == self.record_tag and event == "end"):
+ if not (element.tag in self.record_tags and event == "end"):  # only completely parsed record elements are pushed
continue
- soup = BeautifulSoup(ET.tostring(element), "xml")
- for record in soup.find_all(self.record_tag):
+ if self.use_lxml:
+ soup = BeautifulSoup(lxml.etree.tostring(element), "xml")
+ else:
+ soup = BeautifulSoup(ET.tostring(element), "xml")
+ for record in soup.find_all():
+ if record.name not in self.record_tags:
+ continue
self.importer.push_record(record)
record.decompose()
soup.decompose()
element.clear()
- root.clear()
+ if root is not None:  # explicit None test: element truthiness depends on child count, not on presence
+ root.clear()
counts = self.importer.finish()
- print(counts)
+ print(counts, file=sys.stderr)
return counts
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 201f533c..10ded3fc 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -20,7 +20,7 @@ def test_pubmed_importer(pubmed_importer):
last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/pubmedsample_2019.xml', 'r') as f:
pubmed_importer.bezerk_mode = True
- counts = Bs4XmlLargeFilePusher(pubmed_importer, f, "PubmedArticle").run()
+ counts = Bs4XmlLargeFilePusher(pubmed_importer, f, ["PubmedArticle"]).run()
assert counts['insert'] == 176
assert counts['exists'] == 0
assert counts['skip'] == 0
@@ -37,7 +37,7 @@ def test_pubmed_importer(pubmed_importer):
with open('tests/files/pubmedsample_2019.xml', 'r') as f:
pubmed_importer.bezerk_mode = False
pubmed_importer.reset()
- counts = Bs4XmlLargeFilePusher(pubmed_importer, f, "PubmedArticle").run()
+ counts = Bs4XmlLargeFilePusher(pubmed_importer, f, ["PubmedArticle"]).run()
assert counts['insert'] == 0
assert counts['exists'] == 176
assert counts['skip'] == 0