summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-22 12:20:34 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-22 12:20:34 -0700
commit: d33c8cf05e3c9732b04f56cf356180b9d76e04e0 (patch)
tree: 7c69bd1becae9d760b30c91012abcb159a62b73a /python/fatcat_tools/importers
parent: 4a3112f9f8de73511f354e7f1ceff3f8e2b7036d (diff)
download: fatcat-d33c8cf05e3c9732b04f56cf356180b9d76e04e0.tar.gz
fatcat-d33c8cf05e3c9732b04f56cf356180b9d76e04e0.zip
creative importer for bulk JSTOR imports
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/__init__.py | 2
-rw-r--r-- python/fatcat_tools/importers/common.py | 22
2 files changed, 23 insertions, 1 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d0c6656c..663b9812 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
"""
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
from .jalc import JalcImporter
from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 4d7b29fb..2a434693 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -606,6 +606,28 @@ class Bs4XmlFilePusher(RecordPusher):
return counts
+# Record pusher driven by a list of XML file paths: each listed file is
+# parsed with BeautifulSoup and every element matching `record_tag` is
+# handed to the importer.
+# NOTE(review): leading indentation was flattened in this rendering of the
+# patch, so the nesting of the statements below has to be inferred; confirm
+# against the repository before relying on the notes marked "presumably".
+class Bs4XmlFileListPusher(RecordPusher):
+
+ def __init__(self, importer, list_file, record_tag, **kwargs):
+ # importer: object whose push_record() and finish() methods run() calls
+ # list_file: iterable of text lines, each naming one XML file to open
+ # record_tag: tag name passed to soup.find_all() to select records
+ # **kwargs is accepted but not used anywhere in this class
+ self.importer = importer
+ self.list_file = list_file
+ self.record_tag = record_tag
+
+ def run(self):
+ # Prints and returns whatever importer.finish() returns.
+ for xml_path in self.list_file:
+ xml_path = xml_path.strip()
+ # skip blank lines and lines starting with "#"
+ if not xml_path or xml_path.startswith("#"):
+ continue
+ # file is opened in text mode, so decoding uses the platform default encoding
+ with open(xml_path, 'r') as xml_file:
+ # "xml" selects BeautifulSoup's XML parser
+ soup = BeautifulSoup(xml_file, "xml")
+ for record in soup.find_all(self.record_tag):
+ self.importer.push_record(record)
+ # presumably runs once per file, after its records are pushed, to release the parse tree -- confirm nesting
+ soup.decompose()
+ # presumably runs once, after every listed file has been processed -- confirm nesting
+ counts = self.importer.finish()
+ print(counts)
+ return counts
+
+
class KafkaJsonPusher(RecordPusher):
def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):