summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-05-29 13:46:52 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-05-29 13:46:52 -0700
commitca956fc8b686adc3198eff58bbc8e32e13f9ec47 (patch)
tree949010a4372043c8a0bc3f88f435b7a819fef52f /python/fatcat_tools/importers/common.py
parent86444ad33758563093c3614d2317af61eb825e7d (diff)
downloadfatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.tar.gz
fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.zip
faster LargeFile XML importer for PubMed
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r--python/fatcat_tools/importers/common.py50
1 files changed, 50 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index dee60947..79425618 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -9,6 +9,8 @@ import itertools
import subprocess
import unicodedata
from collections import Counter
+import xml.etree.ElementTree as ET
+
import pykafka
from bs4 import BeautifulSoup
@@ -616,12 +618,59 @@ class Bs4XmlFilePusher(RecordPusher):
soup = BeautifulSoup(self.xml_file, "xml")
for record in soup.find_all(self.record_tag):
self.importer.push_record(record)
+ record.decompose()
counts = self.importer.finish()
soup.decompose()
print(counts)
return counts
class Bs4XmlLargeFilePusher(RecordPusher):
    """
    Variant of Bs4XmlFilePusher which parses large XML files incrementally,
    instead of loading the whole document in RAM first.

    The file is streamed with ElementTree's iterparse(); every time an element
    whose tag equals `record_tag` is completed, that element alone is
    serialized, re-parsed with BeautifulSoup, and handed to the importer.

    Notes carried over from the original author:

    - The dominant source of RAM utilization at start-up is the large
      ISSN/ISSN-L map. This can be confirmed in local development by using
      the small map in ./tests/files/.
    - The implementation is weird/inefficient in that it re-parses every
      article with BeautifulSoup (lxml), but this avoids mangling or
      re-writing the importers against a different BS back-end.
    - In casual testing all of record.decompose(), soup.decompose(),
      element.clear() and root.clear() helped with memory usage.
    """

    def __init__(self, importer, xml_file, record_tag, **kwargs):
        # Extra keyword arguments are accepted but not used by this class.
        self.importer = importer
        self.xml_file = xml_file
        self.record_tag = record_tag

    def run(self):
        """
        Stream `self.xml_file`, push each `self.record_tag` record to the
        importer, then print and return the result of `importer.finish()`.
        """
        elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
        root = None
        for (event, element) in elem_iter:
            # Capture the document root from the very first "start" event.
            # This must be an identity check: an Element's truth value
            # reflects whether it has children, so `not root` becomes true
            # again right after root.clear() and would rebind `root` to a
            # record element, leaving the real root to accumulate children.
            if root is None and event == "start":
                root = element
                continue
            # Only act once a record element has been fully parsed.
            if not (element.tag == self.record_tag and event == "end"):
                continue
            soup = BeautifulSoup(ET.tostring(element), "xml")
            for record in soup.find_all(self.record_tag):
                self.importer.push_record(record)
                record.decompose()
            soup.decompose()
            # Drop the processed subtree, and detach it from the root so the
            # in-memory tree does not grow with the size of the file.
            element.clear()
            root.clear()
        counts = self.importer.finish()
        print(counts)
        return counts
+
+
class Bs4XmlFileListPusher(RecordPusher):
def __init__(self, importer, list_file, record_tag, **kwargs):
@@ -638,6 +687,7 @@ class Bs4XmlFileListPusher(RecordPusher):
soup = BeautifulSoup(xml_file, "xml")
for record in soup.find_all(self.record_tag):
self.importer.push_record(record)
+ record.decompose()
soup.decompose()
counts = self.importer.finish()
print(counts)