summary refs log tree commit diff stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 17:19:58 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 17:19:58 -0700
commit 90560ba18ac042a23db6995cc07a5cef024bc179 (patch)
tree 64f38160b6b6efe26b58f33bb8d50f156119e7d6 /python/fatcat_tools/importers
parent c9c830256315066afdc619eeaba5b234de89468e (diff)
download fatcat-90560ba18ac042a23db6995cc07a5cef024bc179.tar.gz
fatcat-90560ba18ac042a23db6995cc07a5cef024bc179.zip
JALC bulk file importer
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/__init__.py 2
-rw-r--r-- python/fatcat_tools/importers/common.py 20
2 files changed, 21 insertions, 1 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 6f8849d6..d0c6656c 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
from .jalc import JalcImporter
from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 6e0c5caf..c0742914 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -568,6 +568,26 @@ class SqlitePusher(RecordPusher):
return counts
+class Bs4XmlLinesPusher(RecordPusher):  # Pusher that treats each input line as its own XML document and hands the parsed result to the importer.
+
+ def __init__(self, importer, xml_file, prefix_filter=None, **kwargs):  # Stores the three named arguments; extra **kwargs are accepted but not used here.
+ self.importer = importer  # object whose push_record() and finish() are called by run()
+ self.xml_file = xml_file  # iterated line by line in run(); presumably an open text file -- confirm against callers
+ self.prefix_filter = prefix_filter  # optional prefix passed to line.startswith(); a falsy value disables filtering
+
+ def run(self):  # Parses and pushes each accepted line, then prints and returns the value of importer.finish().
+ for line in self.xml_file:
+ if not line:  # NOTE(review): if xml_file is a text file, iterated lines keep their trailing newline, so blank lines are not falsy and would not be skipped here -- confirm intent
+ continue
+ if self.prefix_filter and not line.startswith(self.prefix_filter):  # with a filter set, skip lines that do not start with it
+ continue
+ soup = BeautifulSoup(line, "xml")  # parse just this line, requesting BeautifulSoup's "xml" features
+ self.importer.push_record(soup)
+ counts = self.importer.finish()  # presumably runs once after the loop (indentation is collapsed in this rendering) -- confirm
+ print(counts)
+ return counts
+
+
class Bs4XmlFilePusher(RecordPusher):
def __init__(self, importer, xml_file, record_tag, **kwargs):