| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-29 13:46:52 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-29 13:46:52 -0700 | 
| commit | ca956fc8b686adc3198eff58bbc8e32e13f9ec47 (patch) | |
| tree | 949010a4372043c8a0bc3f88f435b7a819fef52f /python | |
| parent | 86444ad33758563093c3614d2317af61eb825e7d (diff) | |
| download | fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.tar.gz fatcat-ca956fc8b686adc3198eff58bbc8e32e13f9ec47.zip | |
faster LargeFile XML importer for PubMed
Diffstat (limited to 'python')
| -rwxr-xr-x | python/fatcat_import.py | 2 |
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 2 |
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 50 |
| -rw-r--r-- | python/tests/import_pubmed.py | 6 |
4 files changed, 55 insertions, 5 deletions
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 21e40750..d76f706f 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -40,7 +40,7 @@ def run_pubmed(args):
         raise NotImplementedError
         #KafkaBs4XmlPusher(pi, args.kafka_hosts, args.kafka_env, "api-pubmed", "fatcat-import").run()
     else:
-        Bs4XmlFilePusher(pi, args.xml_file, "PubmedArticle").run()
+        Bs4XmlLargeFilePusher(pi, args.xml_file, "PubmedArticle", record_list_tag="PubmedArticleSet").run()
 
 def run_jstor(args):
     ji = JstorImporter(args.api,
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 663b9812..7e23ca8c 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index dee60947..79425618 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -9,6 +9,8 @@
 import itertools
 import subprocess
 import unicodedata
 from collections import Counter
+import xml.etree.ElementTree as ET
+
 import pykafka
 from bs4 import BeautifulSoup
@@ -616,12 +618,59 @@ class Bs4XmlFilePusher(RecordPusher):
         soup = BeautifulSoup(self.xml_file, "xml")
         for record in soup.find_all(self.record_tag):
             self.importer.push_record(record)
+            record.decompose()
         counts = self.importer.finish()
         soup.decompose()
         print(counts)
         return counts
 
 
+class Bs4XmlLargeFilePusher(RecordPusher):
+    """
+    This is a variant of Bs4XmlFilePusher which parses large files
+    incrementally, instead of loading the whole thing in RAM first.
+
+    The dominant source of RAM utilization at start-up is the large ISSN/ISSN-L
+    map. This can be confirmed in local development by using the small map in
+    ./tests/files/.
+
+    Current implementation is a bit inefficient in that it re-parses each
+    article with BeautifulSoup (lxml), but this avoids mangling or re-writing
+    the importer against a different BS back-end.
+
+    At least casual testing showed that record.decompose(), soup.decompose(),
+    element.clear(), and root.clear() all helped with memory usage. With all
+    of these in place, memory growth is very slow and can probably be
+    explained by internal container/release API lookup caches.
+    """
+
+    def __init__(self, importer, xml_file, record_tag, **kwargs):
+        self.importer = importer
+        self.xml_file = xml_file
+        self.record_tag = record_tag
+
+    def run(self):
+        elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
+        i = 0
+        root = None
+        for (event, element) in elem_iter:
+            if not root and event == "start":
+                root = element
+                continue
+            if not (element.tag == self.record_tag and event == "end"):
+                continue
+            soup = BeautifulSoup(ET.tostring(element), "xml")
+            for record in soup.find_all(self.record_tag):
+                self.importer.push_record(record)
+                record.decompose()
+            soup.decompose()
+            element.clear()
+            root.clear()
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class Bs4XmlFileListPusher(RecordPusher):
 
     def __init__(self, importer, list_file, record_tag, **kwargs):
@@ -638,6 +687,7 @@ class Bs4XmlFileListPusher(RecordPusher):
                 soup = BeautifulSoup(xml_file, "xml")
                 for record in soup.find_all(self.record_tag):
                     self.importer.push_record(record)
+                    record.decompose()
                 soup.decompose()
         counts = self.importer.finish()
         print(counts)
diff --git a/python/tests/import_pubmed.py b/python/tests/import_pubmed.py
index 0185c8c4..0d551d02 100644
--- a/python/tests/import_pubmed.py
+++ b/python/tests/import_pubmed.py
@@ -1,7 +1,7 @@
 import json, gzip
 import pytest
 
-from fatcat_tools.importers import PubmedImporter, Bs4XmlFilePusher
+from fatcat_tools.importers import PubmedImporter, Bs4XmlLargeFilePusher
 from fixtures import api
 from bs4 import BeautifulSoup
 
@@ -20,7 +20,7 @@ def test_pubmed_importer(pubmed_importer):
     last_index = pubmed_importer.api.get_changelog(limit=1)[0].index
     with open('tests/files/pubmedsample_2019.xml', 'r') as f:
         pubmed_importer.bezerk_mode = True
-        counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, "PubmedArticle").run()
     assert counts['insert'] == 176
     assert counts['exists'] == 0
     assert counts['skip'] == 0
@@ -37,7 +37,7 @@ def test_pubmed_importer(pubmed_importer):
     with open('tests/files/pubmedsample_2019.xml', 'r') as f:
         pubmed_importer.bezerk_mode = False
         pubmed_importer.reset()
-        counts = Bs4XmlFilePusher(pubmed_importer, f, "PubmedArticle").run()
+        counts = Bs4XmlLargeFilePusher(pubmed_importer, f, "PubmedArticle").run()
     assert counts['insert'] == 0
     assert counts['exists'] == 176
     assert counts['skip'] == 0
```
