diff options
Diffstat (limited to 'python')
| -rwxr-xr-x | python/fatcat_import.py | 18 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 2 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 22 | 
3 files changed, 41 insertions, 1 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 91fa2279..02a3441f 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -41,6 +41,12 @@ def run_pubmed(args):
     else:
         Bs4XmlFilePusher(pi, args.xml_file, "PubmedArticle").run()
 
+def run_jstor(args):
+    ji = JstorImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size)
+    Bs4XmlFileListPusher(ji, args.list_file, "article").run()
+
 def run_orcid(args):
     foi = OrcidImporter(args.api,
         edit_batch_size=args.batch_size)
@@ -210,6 +216,18 @@ def main():
         action='store_true',
         help="consume from kafka topic (not stdin)")
 
+    sub_jstor = subparsers.add_parser('jstor')
+    sub_jstor.set_defaults(
+        func=run_jstor,
+        auth_var="FATCAT_AUTH_WORKER_JSTOR",
+    )
+    sub_jstor.add_argument('list_file',
+        help="List of JSTOR XML file paths to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_jstor.add_argument('issn_map_file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+
     sub_orcid = subparsers.add_parser('orcid')
     sub_orcid.set_defaults(
         func=run_orcid,
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d0c6656c..663b9812 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 4d7b29fb..2a434693 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -606,6 +606,28 @@ class Bs4XmlFilePusher(RecordPusher):
         return counts
 
 
+class Bs4XmlFileListPusher(RecordPusher):
+
+    def __init__(self, importer, list_file, record_tag, **kwargs):
+        self.importer = importer
+        self.list_file = list_file
+        self.record_tag = record_tag
+
+    def run(self):
+        for xml_path in self.list_file:
+            xml_path = xml_path.strip()
+            if not xml_path or xml_path.startswith("#"):
+                continue
+            with open(xml_path, 'r') as xml_file:
+                soup = BeautifulSoup(xml_file, "xml")
+                for record in soup.find_all(self.record_tag):
+                    self.importer.push_record(record)
+                soup.decompose()
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class KafkaJsonPusher(RecordPusher):
 
     def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
