From ca956fc8b686adc3198eff58bbc8e32e13f9ec47 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 May 2019 13:46:52 -0700
Subject: faster LargeFile XML importer for PubMed

---
 python/fatcat_tools/importers/__init__.py |  2 +-
 python/fatcat_tools/importers/common.py   | 50 +++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'python/fatcat_tools/importers')

diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 663b9812..7e23ca8c 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index dee60947..79425618 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -9,6 +9,8 @@ import itertools
 import subprocess
 import unicodedata
 from collections import Counter
+import xml.etree.ElementTree as ET
+
 import pykafka
 from bs4 import BeautifulSoup
 
@@ -616,12 +618,59 @@ class Bs4XmlFilePusher(RecordPusher):
         soup = BeautifulSoup(self.xml_file, "xml")
         for record in soup.find_all(self.record_tag):
             self.importer.push_record(record)
+            record.decompose()
         counts = self.importer.finish()
         soup.decompose()
         print(counts)
         return counts
 
 
+class Bs4XmlLargeFilePusher(RecordPusher):
+    """
+    This is a variant of Bs4XmlFilePusher which parses large files
+    incrementally, instead of loading the whole thing in RAM first.
+
+    The dominant source of RAM utilization at start-up is the large ISSN/ISSN-L
+    map. This can be confirmed in local development by using the small map in
+    ./tests/files/.
+
+    Current implementation is weird/inefficient in that it re-parses with
+    BeautifulSoup (lxml) every article, but I didn't want to mangle or re-write
+    with a different BS back-end.
+
+    Did at least casual testing and all of: record.decompose(),
+    soup.decompose(), element.clear(), root.clear() helped with memory usage.
+    With all of these, memory growth is very slow and can probably be explained
+    by inner container/release API lookup caches.
+    """
+
+    def __init__(self, importer, xml_file, record_tag, **kwargs):
+        self.importer = importer
+        self.xml_file = xml_file
+        self.record_tag = record_tag
+
+    def run(self):
+        elem_iter = ET.iterparse(self.xml_file, ["start", "end"])
+        root = None
+        for (event, element) in elem_iter:
+            # Identity test: a cleared (childless) root is falsy under `not root`.
+            if root is None and event == "start":
+                root = element
+                continue
+            if not (element.tag == self.record_tag and event == "end"):
+                continue
+            soup = BeautifulSoup(ET.tostring(element), "xml")
+            for record in soup.find_all(self.record_tag):
+                self.importer.push_record(record)
+                record.decompose()
+            soup.decompose()
+            element.clear()
+            root.clear()
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class Bs4XmlFileListPusher(RecordPusher):
 
     def __init__(self, importer, list_file, record_tag, **kwargs):
@@ -638,6 +687,7 @@ class Bs4XmlFileListPusher(RecordPusher):
                 soup = BeautifulSoup(xml_file, "xml")
                 for record in soup.find_all(self.record_tag):
                     self.importer.push_record(record)
+                    record.decompose()
                 soup.decompose()
         counts = self.importer.finish()
         print(counts)
-- 
cgit v1.2.3