diff options
Diffstat (limited to 'python/fatcat_tools/importers')
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 20 | 
2 files changed, 21 insertions, 1 deletions
|
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 6f8849d6..d0c6656c 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
 """
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 6e0c5caf..c0742914 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -568,6 +568,26 @@ class SqlitePusher(RecordPusher):
         return counts
+class Bs4XmlLinesPusher(RecordPusher):
+
+    def __init__(self, importer, xml_file, prefix_filter=None, **kwargs):
+        self.importer = importer
+        self.xml_file = xml_file
+        self.prefix_filter = prefix_filter
+
+    def run(self):
+        for line in self.xml_file:
+            if not line:
+                continue
+            if self.prefix_filter and not line.startswith(self.prefix_filter):
+                continue
+            soup = BeautifulSoup(line, "xml")
+            self.importer.push_record(soup)
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class Bs4XmlFilePusher(RecordPusher):
     def __init__(self, importer, xml_file, record_tag, **kwargs):
|
