From d33c8cf05e3c9732b04f56cf356180b9d76e04e0 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 22 May 2019 12:20:34 -0700
Subject: creative importer for bulk JSTOR imports

---
Review notes (not part of the commit message; text between the "---"
separator and the first "diff --git" line is ignored when the patch is
applied):

* This copy of the patch had been flattened onto two lines. Line breaks
  are restored from the +/- markers and agree with the hunk headers
  (-41,6 +41,12 / -210,6 +216,18 / -12,7 +12,7 / -606,6 +606,28).
  Indentation inside the hunks could not be recovered from the
  flattened text; it is re-created from Python block structure
  (4 spaces per level, continuation lines one extra level) and should
  be checked against the repository before applying.
* fatcat_import.py: 'list_file' and 'issn_map_file' are positional
  arguments declared without nargs='?', so argparse always requires a
  value for them; the default=sys.stdin / default=None settings never
  take effect.
* common.py: Bs4XmlFileListPusher.run() opens each listed path in text
  mode ('r'), so the content is decoded with the interpreter's default
  text encoding before BeautifulSoup sees it. NOTE(review): consider
  'rb' if the JSTOR files may use another encoding; confirm against
  the data.

 python/fatcat_import.py                   | 18 ++++++++++++++++++
 python/fatcat_tools/importers/__init__.py |  2 +-
 python/fatcat_tools/importers/common.py   | 22 ++++++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 91fa2279..02a3441f 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -41,6 +41,12 @@ def run_pubmed(args):
     else:
         Bs4XmlFilePusher(pi, args.xml_file, "PubmedArticle").run()
 
+def run_jstor(args):
+    ji = JstorImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size)
+    Bs4XmlFileListPusher(ji, args.list_file, "article").run()
+
 def run_orcid(args):
     foi = OrcidImporter(args.api,
         edit_batch_size=args.batch_size)
@@ -210,6 +216,18 @@ def main():
         action='store_true',
         help="consume from kafka topic (not stdin)")
 
+    sub_jstor = subparsers.add_parser('jstor')
+    sub_jstor.set_defaults(
+        func=run_jstor,
+        auth_var="FATCAT_AUTH_WORKER_JSTOR",
+    )
+    sub_jstor.add_argument('list_file',
+        help="List of JSTOR XML file paths to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_jstor.add_argument('issn_map_file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+
     sub_orcid = subparsers.add_parser('orcid')
     sub_orcid.set_defaults(
         func=run_orcid,
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index d0c6656c..663b9812 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -12,7 +12,7 @@ To run an import you combine two classes; one each of:
 
 """
 
-from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
+from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
 from .jalc import JalcImporter
 from .jstor import JstorImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 4d7b29fb..2a434693 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -606,6 +606,28 @@ class Bs4XmlFilePusher(RecordPusher):
         return counts
 
 
+class Bs4XmlFileListPusher(RecordPusher):
+
+    def __init__(self, importer, list_file, record_tag, **kwargs):
+        self.importer = importer
+        self.list_file = list_file
+        self.record_tag = record_tag
+
+    def run(self):
+        for xml_path in self.list_file:
+            xml_path = xml_path.strip()
+            if not xml_path or xml_path.startswith("#"):
+                continue
+            with open(xml_path, 'r') as xml_file:
+                soup = BeautifulSoup(xml_file, "xml")
+                for record in soup.find_all(self.record_tag):
+                    self.importer.push_record(record)
+                soup.decompose()
+        counts = self.importer.finish()
+        print(counts)
+        return counts
+
+
 class KafkaJsonPusher(RecordPusher):
 
     def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs):
-- 
cgit v1.2.3