faster LargeFile XML importer for PubMed

author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-29 13:46:52 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-29 13:46:52 -0700
commit: ca956fc8b686adc3198eff58bbc8e32e13f9ec47 (patch)
tree: 949010a4372043c8a0bc3f88f435b7a819fef52f /python
parent: 86444ad33758563093c3614d2317af61eb825e7d (diff)
download: fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.tar.gz fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.zip
4 files changed, 55 insertions, 5 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 21e40750..d76f706f 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -40,7 +40,7 @@ def run_pubmed(args):
         raise NotImplementedError
         #KafkaBs4XmlPusher(pi, args.kafka_hosts, args.kafka_env, "api-pubmed", "fatcat-import").run()
     else:
-        Bs4XmlFilePusher(pi, args.xml_file, "PubmedArticle").run()
+        Bs4XmlLargeFilePusher(pi, args.xml_file, "PubmedArticle", record_list_tag="PubmedArticleSet").run()
 
 def run_jstor(args):
     ji = JstorImporter(args.api,
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 663b9812..7e23ca8c 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index dee60947..79425618 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -9,6 +9,8 @@ import itertools
 import subprocess
 import unicodedata
 from collections import Counter
+import xml.etree.ElementTree as ET
+
 import pykafka
 from bs4 import BeautifulSoup
 
@@ -616,12 +618,59 @@ class Bs4XmlFilePusher(RecordPusher):
         soup = BeautifulSoup(self.xml_file, "xml")
         for record in soup.find_all(self.record_tag):
             self.importer.push_record(record)
+            record.decompose()
         counts = self.importer.finish()
         soup.decompose()
         print(counts)
         return counts
 
 
+class Bs4XmlLargeFilePusher(RecordPusher):
+    """
+    This is a variant of Bs4XmlFilePusher which parses large files
+    incrementally, instead of loading the whole thing in RAM first.
+
+    The dominant source of RAM utilization at start-up is the large ISSN/ISSN-L
+    map. This can be confirmed in local development by using the small map in
+    ./tests/files/.
+
+    Current implementation is weird/inefficient in that it re-parses with
+    BeautifulSoup (lxml) every article, but I didn't want to mangle or re-write
+    with a different BS back-end.
+
+    Did at least casual testing and all of: record.decompose(),
+    soup.decompose(), element.clear(), root.clear() helped with memory usage.
+    With all of these, memory growth is very slow and can probably be explained
+    by inner container/release API lookup caches.
+    """
+
+    def __init__(self, importer, xml_file, record_tag, **kwargs):
+        self.importer = importer
+        self.xml_file = xml_file
+        self.record_tag = record_tag
+
+    def run(self):
+        elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
+        i = 0
+        root = None
+        for (event, element) in elem_iter:
+            if not root and event == "start":
+                root = element
+                continue
+            if not (element.tag == self.record_tag and event == "end"):
+                continue
+            soup = BeautifulSoup(ET.tostring(element), "xml")
+            for record in soup.find_all(self.record_tag):
+                self.importer.push_record(record)
+                record.decompose()
+            soup.decompose()
+            element.clear()
+            root.clear()
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class Bs4XmlFileListPusher(RecordPusher):
 
     def __init__(self, importer, list_file, record_tag, **kwargs):
@@ -638,6 +687,7 @@ class Bs4XmlFileListPusher(RecordPusher):
                 soup = BeautifulSoup(xml_file, "xml")
                 for record in soup.find_all(self.record_tag):
                     self.importer.push_record(record)
+                    record.decompose()
                 soup.decompose()
         counts = self.importer.finish()
         print(counts)
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 0185c8c4..0d551d02 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -1,7 +1,7 @@
 
 import json, gzip
 import pytest
-from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher
+from fatcat_tools.importers import PubmedImporter, Bs4XmlLargeFilePusher
 from fixtures import api
 from bs4 import BeautifulSoup
 
@@ -20,7 +20,7 @@ def test_pubmed_importer(pubmed_importer):
     last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
     with open('tests/files/pubmedsample_2019.xml', 'r') as f:
         pubmed_importer.bezerk_mode = True
-        counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, "PubmedArticle").run()
     assert counts['insert'] == 176
     assert counts['exists'] == 0
     assert counts['skip'] == 0
@@ -37,7 +37,7 @@ def test_pubmed_importer(pubmed_importer):
     with open('tests/files/pubmedsample_2019.xml', 'r') as f:
         pubmed_importer.bezerk_mode = False
         pubmed_importer.reset()
-        counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, "PubmedArticle").run()
     assert counts['insert'] == 0
     assert counts['exists'] == 176
     assert counts['skip'] == 0
author	Bryan Newbold <bnewbold@robocracy.org>	2019-05-29 13:46:52 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-29 13:46:52 -0700
commit	ca956fc8b686adc3198eff58bbc8e32e13f9ec47 (patch)
tree	949010a4372043c8a0bc3f88f435b7a819fef52f /python
parent	86444ad33758563093c3614d2317af61eb825e7d (diff)
download	fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.tar.gz fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.zip