author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:09 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-02 18:14:09 -0700 |
commit | 6464631dbe5c4afeb76f2f3c9d63b89f917c9a3b (patch) | |
tree | 633303839cafc7d901cf8565e034542606a5bb27 /python | |
parent | cdfd6b85b386b7bbf9d5a5179ef26970b6e5a4e7 (diff) | |
download | fatcat-6464631dbe5c4afeb76f2f3c9d63b89f917c9a3b.tar.gz, fatcat-6464631dbe5c4afeb76f2f3c9d63b89f917c9a3b.zip |
fmt (black): *.py
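
This is a formatting-only pass: the `*.py` files under `python/` were rewritten by black, the Python code formatter, with no intended behavior change. A minimal sketch of reproducing the same pass, assuming black's `format_str()`/`Mode()` API and a 96-character line length (the project's configured value is not shown in this diff; black's default is 88):

```python
# Sketch: re-apply the same formatting pass programmatically.
# Assumptions: black's format_str()/Mode() API and a 96-character line
# length (not confirmed by this diff; black's default is 88).
from pathlib import Path

import black

MODE = black.Mode(line_length=96)

for path in sorted(Path("python").rglob("*.py")):
    src = path.read_text()
    formatted = black.format_str(src, mode=MODE)
    if formatted != src:
        path.write_text(formatted)
        print(f"reformatted {path}")

# Equivalent CLI (assumption): python -m black --line-length 96 python/
```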
Diffstat (limited to 'python')
-rwxr-xr-x | python/fatcat_cleanup.py | 58
-rwxr-xr-x | python/fatcat_export.py | 56
-rwxr-xr-x | python/fatcat_harvest.py | 94
-rwxr-xr-x | python/fatcat_import.py | 950
-rwxr-xr-x | python/fatcat_ingest.py | 210
-rwxr-xr-x | python/fatcat_review.py | 45
-rwxr-xr-x | python/fatcat_transform.py | 172
-rwxr-xr-x | python/fatcat_util.py | 48
-rwxr-xr-x | python/fatcat_webface.py | 23
-rwxr-xr-x | python/fatcat_worker.py | 139
-rw-r--r-- | python/shell.py | 30
11 files changed, 1110 insertions, 715 deletions
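
Every hunk below is the same kind of mechanical rewrite: single-quoted strings become double-quoted, hand-wrapped argument lists are collapsed or exploded to fit the configured line length (with trailing commas on exploded calls), and top-level functions get two blank lines between them. For illustration, one hunk from `python/fatcat_cleanup.py` is reproduced here as runnable Python, with the pre-black form kept in comments:

```python
import argparse

parser = argparse.ArgumentParser()

# Before (hand-wrapped, single-quoted option name):
# parser.add_argument('--batch-size',
#     help="size of batch to send",
#     default=50, type=int)

# After black (double quotes, packed onto one line within the length limit):
parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
```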
diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py index 04ee2bd9..f8030b16 100755 --- a/python/fatcat_cleanup.py +++ b/python/fatcat_cleanup.py @@ -15,38 +15,45 @@ sentry_client = raven.Client() def run_files(args): - fmi = FileCleaner(args.api, + fmi = FileCleaner( + args.api, dry_run_mode=args.dry_run, edit_batch_size=args.batch_size, - editgroup_description=args.editgroup_description_override) + editgroup_description=args.editgroup_description_override, + ) JsonLinePusher(fmi, args.json_file).run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--fatcat-api-url', - default="http://localhost:9411/v0", - help="connect to this host/port") - parser.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - parser.add_argument('--editgroup-description-override', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument( + "--editgroup-description-override", help="editgroup description override", - default=None, type=str) - parser.add_argument('--dry-run', - help="dry-run mode (don't actually update)", - default=False, type=bool) + default=None, + type=str, + ) + parser.add_argument( + "--dry-run", help="dry-run mode (don't actually update)", default=False, type=bool + ) subparsers = parser.add_subparsers() - sub_files = subparsers.add_parser('files', - help="attempt metadata cleanups over a list of file entities") + sub_files = subparsers.add_parser( + "files", help="attempt metadata cleanups over a list of file entities" + ) sub_files.set_defaults( func=run_files, auth_var="FATCAT_AUTH_WORKER_CLEANUP", ) - sub_files.add_argument('json_file', + sub_files.add_argument( + "json_file", help="files JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -55,15 +62,18 @@ def main(): # allow editgroup description override via env variable (but CLI arg takes # precedence) - if not args.editgroup_description_override \ - and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'): - args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION') + if not args.editgroup_description_override and os.environ.get( + "FATCAT_EDITGROUP_DESCRIPTION" + ): + args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION") args.api = authenticated_api( args.fatcat_api_url, # token is an optional kwarg (can be empty string, None, etc) - token=os.environ.get(args.auth_var)) + token=os.environ.get(args.auth_var), + ) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_export.py b/python/fatcat_export.py index 9a7cfd01..ebdc5af8 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -19,7 +19,9 @@ def run_export_releases(args): ident = uuid2fcid(line.split()[0]) release = args.api.get_release(ident=ident, expand="all") args.json_output.write( - json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n") + json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n" + ) + def run_export_changelog(args): end = args.end @@ -30,36 +32,47 @@ def 
run_export_changelog(args): for i in range(args.start, end): entry = args.api.get_changelog_entry(index=i) args.json_output.write( - json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n") + json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n" + ) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--fatcat-api-url', - default="http://localhost:9411/v0", - help="connect to this host/port") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) subparsers = parser.add_subparsers() - sub_releases = subparsers.add_parser('releases') + sub_releases = subparsers.add_parser("releases") sub_releases.set_defaults(func=run_export_releases) - sub_releases.add_argument('ident_file', + sub_releases.add_argument( + "ident_file", help="TSV list of fatcat release idents to dump", - default=sys.stdin, type=argparse.FileType('r')) - sub_releases.add_argument('json_output', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_releases.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) + default=sys.stdout, + type=argparse.FileType("w"), + ) - sub_changelog = subparsers.add_parser('changelog') + sub_changelog = subparsers.add_parser("changelog") sub_changelog.set_defaults(func=run_export_changelog) - sub_changelog.add_argument('--start', - help="index to start dumping at", - default=1, type=int) - sub_changelog.add_argument('--end', + sub_changelog.add_argument("--start", help="index to start dumping at", default=1, type=int) + sub_changelog.add_argument( + "--end", help="index to stop dumping at (else detect most recent)", - default=None, type=int) - sub_changelog.add_argument('json_output', + default=None, + type=int, + ) + sub_changelog.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) + default=sys.stdout, + type=argparse.FileType("w"), + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -69,5 +82,6 @@ def main(): args.api = public_api(args.fatcat_api_url) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py index 0324aa52..91356aad 100755 --- a/python/fatcat_harvest.py +++ b/python/fatcat_harvest.py @@ -26,9 +26,11 @@ def run_crossref(args): state_topic=f"fatcat-{args.env}.api-crossref-state", contact_email=args.contact_email, start_date=args.start_date, - end_date=args.end_date) + end_date=args.end_date, + ) worker.run(continuous=args.continuous) + def run_datacite(args): worker = HarvestDataciteWorker( kafka_hosts=args.kafka_hosts, @@ -36,93 +38,108 @@ def run_datacite(args): state_topic=f"fatcat-{args.env}.api-datacite-state", contact_email=args.contact_email, start_date=args.start_date, - end_date=args.end_date) + end_date=args.end_date, + ) worker.run(continuous=args.continuous) + def run_arxiv(args): worker = HarvestArxivWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.oaipmh-arxiv", state_topic=f"fatcat-{args.env}.oaipmh-arxiv-state", start_date=args.start_date, - end_date=args.end_date) + end_date=args.end_date, + ) worker.run(continuous=args.continuous) + def run_pubmed(args): worker = PubmedFTPWorker( kafka_hosts=args.kafka_hosts, 
produce_topic=f"fatcat-{args.env}.ftp-pubmed", state_topic=f"fatcat-{args.env}.ftp-pubmed-state", start_date=args.start_date, - end_date=args.end_date) + end_date=args.end_date, + ) worker.run(continuous=args.continuous) + def run_doaj_article(args): worker = HarvestDoajArticleWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.oaipmh-doaj-article", state_topic="fatcat-{args.env}.oaipmh-doaj-article-state", start_date=args.start_date, - end_date=args.end_date) + end_date=args.end_date, + ) worker.run(continuous=args.continuous) + def run_doaj_journal(args): worker = HarvestDoajJournalWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.oaipmh-doaj-journal", state_topic=f"fatcat-{args.env}.oaipmh-doaj-journal-state", start_date=args.start_date, - end_date=args.end_date) + end_date=args.end_date, + ) worker.run(continuous=args.continuous) def mkdate(raw): return datetime.datetime.strptime(raw, "%Y-%m-%d").date() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--kafka-hosts', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--kafka-hosts", default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('--start-date', - default=None, type=mkdate, - help="beginning of harvest period") - parser.add_argument('--end-date', - default=None, type=mkdate, - help="end of harvest period") - parser.add_argument('--contact-email', - default="undefined", # better? - help="contact email to use in API header") - parser.add_argument('--continuous', - action='store_true', - help="continue harvesting indefinitely in a loop?") + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) + parser.add_argument( + "--start-date", default=None, type=mkdate, help="beginning of harvest period" + ) + parser.add_argument("--end-date", default=None, type=mkdate, help="end of harvest period") + parser.add_argument( + "--contact-email", + default="undefined", # better? + help="contact email to use in API header", + ) + parser.add_argument( + "--continuous", action="store_true", help="continue harvesting indefinitely in a loop?" 
+ ) subparsers = parser.add_subparsers() - sub_crossref = subparsers.add_parser('crossref', - help="harvest DOI metadata from Crossref API (JSON)") + sub_crossref = subparsers.add_parser( + "crossref", help="harvest DOI metadata from Crossref API (JSON)" + ) sub_crossref.set_defaults(func=run_crossref) - sub_datacite = subparsers.add_parser('datacite', - help="harvest DOI metadata from Datacite API (JSON)") + sub_datacite = subparsers.add_parser( + "datacite", help="harvest DOI metadata from Datacite API (JSON)" + ) sub_datacite.set_defaults(func=run_datacite) - sub_arxiv = subparsers.add_parser('arxiv', - help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)") + sub_arxiv = subparsers.add_parser( + "arxiv", help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)" + ) sub_arxiv.set_defaults(func=run_arxiv) - sub_pubmed = subparsers.add_parser('pubmed', - help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)") + sub_pubmed = subparsers.add_parser( + "pubmed", help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)" + ) sub_pubmed.set_defaults(func=run_pubmed) # DOAJ stuff disabled because API range-requests are broken - #sub_doaj_article = subparsers.add_parser('doaj-article') - #sub_doaj_article.set_defaults(func=run_doaj_article) - #sub_doaj_journal = subparsers.add_parser('doaj-journal') - #sub_doaj_journal.set_defaults(func=run_doaj_journal) + # sub_doaj_article = subparsers.add_parser('doaj-article') + # sub_doaj_article.set_defaults(func=run_doaj_article) + # sub_doaj_journal = subparsers.add_parser('doaj-journal') + # sub_doaj_journal.set_defaults(func=run_doaj_journal) args = parser.parse_args() if not args.__dict__.get("func"): @@ -130,5 +147,6 @@ def main(): sys.exit(-1) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 3225688c..42001974 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -51,11 +51,13 @@ sentry_client = raven.Client() def run_crossref(args): - fci = CrossrefImporter(args.api, + fci = CrossrefImporter( + args.api, args.issn_map_file, extid_map_file=args.extid_map_file, edit_batch_size=args.batch_size, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + ) if args.kafka_mode: KafkaJsonPusher( fci, @@ -68,15 +70,14 @@ def run_crossref(args): else: JsonLinePusher(fci, args.json_file).run() + def run_jalc(args): - ji = JalcImporter(args.api, - args.issn_map_file, - extid_map_file=args.extid_map_file) + ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run() + def run_arxiv(args): - ari = ArxivRawImporter(args.api, - edit_batch_size=args.batch_size) + ari = ArxivRawImporter(args.api, edit_batch_size=args.batch_size) if args.kafka_mode: KafkaBs4XmlPusher( ari, @@ -87,15 +88,18 @@ def run_arxiv(args): ).run() else: if args.xml_file == sys.stdin: - print('note: reading from stdin', file=sys.stderr) + print("note: reading from stdin", file=sys.stderr) Bs4XmlFilePusher(ari, args.xml_file, "record").run() + def run_pubmed(args): - pi = PubmedImporter(args.api, + pi = PubmedImporter( + args.api, args.issn_map_file, edit_batch_size=args.batch_size, do_updates=args.do_updates, - lookup_refs=(not args.no_lookup_refs)) + lookup_refs=(not args.no_lookup_refs), + ) if args.kafka_mode: KafkaBs4XmlPusher( pi, @@ -111,62 +115,67 @@ def run_pubmed(args): ["PubmedArticle"], ).run() + def run_jstor(args): - ji = 
JstorImporter(args.api, - args.issn_map_file, - edit_batch_size=args.batch_size) + ji = JstorImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size) Bs4XmlFileListPusher(ji, args.list_file, "article").run() + def run_orcid(args): - foi = OrcidImporter(args.api, - edit_batch_size=args.batch_size) + foi = OrcidImporter(args.api, edit_batch_size=args.batch_size) JsonLinePusher(foi, args.json_file).run() + def run_journal_metadata(args): - fii = JournalMetadataImporter(args.api, - edit_batch_size=args.batch_size) + fii = JournalMetadataImporter(args.api, edit_batch_size=args.batch_size) JsonLinePusher(fii, args.json_file).run() + def run_chocula(args): - fii = ChoculaImporter(args.api, - edit_batch_size=args.batch_size, - do_updates=args.do_updates) + fii = ChoculaImporter(args.api, edit_batch_size=args.batch_size, do_updates=args.do_updates) JsonLinePusher(fii, args.json_file).run() + def run_matched(args): - fmi = MatchedImporter(args.api, + fmi = MatchedImporter( + args.api, edit_batch_size=args.batch_size, editgroup_description=args.editgroup_description_override, default_link_rel=args.default_link_rel, - default_mimetype=args.default_mimetype) + default_mimetype=args.default_mimetype, + ) JsonLinePusher(fmi, args.json_file).run() + def run_arabesque_match(args): - if (args.sqlite_file and args.json_file) or not (args.sqlite_file or - args.json_file): + if (args.sqlite_file and args.json_file) or not (args.sqlite_file or args.json_file): print("Supply one of --sqlite-file or --json-file") - ami = ArabesqueMatchImporter(args.api, + ami = ArabesqueMatchImporter( + args.api, editgroup_description=args.editgroup_description_override, do_updates=args.do_updates, require_grobid=(not args.no_require_grobid), extid_type=args.extid_type, crawl_id=args.crawl_id, default_link_rel=args.default_link_rel, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.sqlite_file: - SqlitePusher(ami, args.sqlite_file, "crawl_result", - ARABESQUE_MATCH_WHERE_CLAUSE).run() + SqlitePusher(ami, args.sqlite_file, "crawl_result", ARABESQUE_MATCH_WHERE_CLAUSE).run() elif args.json_file: JsonLinePusher(ami, args.json_file).run() + def run_ingest_file(args): - ifri = IngestFileResultImporter(args.api, + ifri = IngestFileResultImporter( + args.api, editgroup_description=args.editgroup_description_override, skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, require_grobid=(not args.no_require_grobid), - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -180,13 +189,16 @@ def run_ingest_file(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_ingest_web(args): - iwri = IngestWebResultImporter(args.api, + iwri = IngestWebResultImporter( + args.api, editgroup_description=args.editgroup_description_override, skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( iwri, @@ -201,13 +213,16 @@ def run_ingest_web(args): else: JsonLinePusher(iwri, args.json_file).run() + def run_ingest_fileset(args): - ifri = IngestFilesetResultImporter(args.api, + ifri = IngestFilesetResultImporter( + args.api, editgroup_description=args.editgroup_description_override, skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, - 
edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -222,10 +237,13 @@ def run_ingest_fileset(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_savepapernow_file(args): - ifri = SavePaperNowFileImporter(args.api, + ifri = SavePaperNowFileImporter( + args.api, editgroup_description=args.editgroup_description_override, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -240,10 +258,13 @@ def run_savepapernow_file(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_savepapernow_web(args): - ifri = SavePaperNowWebImporter(args.api, + ifri = SavePaperNowWebImporter( + args.api, editgroup_description=args.editgroup_description_override, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -258,10 +279,13 @@ def run_savepapernow_web(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_savepapernow_fileset(args): - ifri = SavePaperNowFilesetImporter(args.api, + ifri = SavePaperNowFilesetImporter( + args.api, editgroup_description=args.editgroup_description_override, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -276,18 +300,22 @@ def run_savepapernow_fileset(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_grobid_metadata(args): - fmi = GrobidMetadataImporter(args.api, + fmi = GrobidMetadataImporter( + args.api, edit_batch_size=args.batch_size, longtail_oa=args.longtail_oa, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + ) LinePusher(fmi, args.tsv_file).run() + def run_shadow_lib(args): - fmi = ShadowLibraryImporter(args.api, - edit_batch_size=100) + fmi = ShadowLibraryImporter(args.api, edit_batch_size=100) JsonLinePusher(fmi, args.json_file).run() + def run_wayback_static(args): api = args.api @@ -295,8 +323,8 @@ def run_wayback_static(args): if args.release_id: release_id = args.release_id elif args.extid: - idtype = args.extid.split(':')[0] - extid = ':'.join(args.extid.split(':')[1:]) + idtype = args.extid.split(":")[0] + extid = ":".join(args.extid.split(":")[1:]) if idtype == "doi": release_id = api.lookup_release(doi=extid).ident elif idtype == "pmid": @@ -309,8 +337,9 @@ def run_wayback_static(args): raise Exception("need either release_id or extid argument") # create it - (editgroup_id, wc) = auto_wayback_static(api, release_id, args.wayback_url, - editgroup_id=args.editgroup_id) + (editgroup_id, wc) = auto_wayback_static( + api, release_id, args.wayback_url, editgroup_id=args.editgroup_id + ) if not wc: return print("release_id: {}".format(release_id)) @@ -318,12 +347,14 @@ def run_wayback_static(args): print("webcapture id: {}".format(wc.ident)) print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) + def run_cdl_dash_dat(args): api = args.api # create it - (editgroup_id, release, fs) = auto_cdl_dash_dat(api, args.dat_path, - release_id=args.release_id, editgroup_id=args.editgroup_id) + (editgroup_id, release, fs) = auto_cdl_dash_dat( + api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id + ) if not fs: return print("release_id: {}".format(release.ident)) @@ -331,14 +362,17 @@ def run_cdl_dash_dat(args): print("fileset id: {}".format(fs.ident)) print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) + def run_datacite(args): - dci = DataciteImporter(args.api, + dci = DataciteImporter( + 
args.api, args.issn_map_file, edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, extid_map_file=args.extid_map_file, - insert_log_file=args.insert_log_file) + insert_log_file=args.insert_log_file, + ) if args.kafka_mode: KafkaJsonPusher( dci, @@ -351,8 +385,10 @@ def run_datacite(args): else: JsonLinePusher(dci, args.json_file).run() + def run_doaj_article(args): - dai = DoajArticleImporter(args.api, + dai = DoajArticleImporter( + args.api, args.issn_map_file, edit_batch_size=args.batch_size, do_updates=args.do_updates, @@ -369,8 +405,10 @@ def run_doaj_article(args): else: JsonLinePusher(dai, args.json_file).run() + def run_dblp_release(args): - dri = DblpReleaseImporter(args.api, + dri = DblpReleaseImporter( + args.api, dblp_container_map_file=args.dblp_container_map_file, edit_batch_size=args.batch_size, do_updates=args.do_updates, @@ -383,8 +421,10 @@ def run_dblp_release(args): use_lxml=True, ).run() + def run_dblp_container(args): - dci = DblpContainerImporter(args.api, + dci = DblpContainerImporter( + args.api, args.issn_map_file, dblp_container_map_file=args.dblp_container_map_file, dblp_container_map_output=args.dblp_container_map_output, @@ -393,14 +433,17 @@ def run_dblp_container(args): ) JsonLinePusher(dci, args.json_file).run() + def run_file_meta(args): # do_updates defaults to true for this importer - fmi = FileMetaImporter(args.api, + fmi = FileMetaImporter( + args.api, edit_batch_size=100, editgroup_description=args.editgroup_description_override, ) JsonLinePusher(fmi, args.json_file).run() + def run_fileset(args): fmi = FilesetImporter( args.api, @@ -409,478 +452,664 @@ def run_fileset(args): ) JsonLinePusher(fmi, args.json_file).run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--host-url', - default="http://localhost:9411/v0", - help="connect to this host/port") - parser.add_argument('--kafka-hosts', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument( + "--kafka-hosts", default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa)") - parser.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - parser.add_argument('--editgroup-description-override', + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa)" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument( + "--editgroup-description-override", help="editgroup description override", - default=None, type=str) + default=None, + type=str, + ) subparsers = parser.add_subparsers() - sub_crossref = subparsers.add_parser('crossref', - help="import Crossref API metadata format (JSON)") + sub_crossref = subparsers.add_parser( + "crossref", help="import Crossref API metadata format (JSON)" + ) sub_crossref.set_defaults( func=run_crossref, auth_var="FATCAT_AUTH_WORKER_CROSSREF", ) - sub_crossref.add_argument('json_file', + sub_crossref.add_argument( + "json_file", help="crossref JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_crossref.add_argument('issn_map_file', + default=sys.stdin, + 
type=argparse.FileType("r"), + ) + sub_crossref.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_crossref.add_argument('--extid-map-file', + default=None, + type=argparse.FileType("r"), + ) + sub_crossref.add_argument( + "--extid-map-file", help="DOI-to-other-identifiers sqlite3 database", - default=None, type=str) - sub_crossref.add_argument('--no-lookup-refs', - action='store_true', - help="skip lookup of references (PMID or DOI)") - sub_crossref.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_crossref.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") - - sub_jalc = subparsers.add_parser('jalc', - help="import JALC DOI metadata from XML dump") + default=None, + type=str, + ) + sub_crossref.add_argument( + "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" + ) + sub_crossref.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_crossref.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)", + ) + + sub_jalc = subparsers.add_parser("jalc", help="import JALC DOI metadata from XML dump") sub_jalc.set_defaults( func=run_jalc, auth_var="FATCAT_AUTH_WORKER_JALC", ) - sub_jalc.add_argument('xml_file', + sub_jalc.add_argument( + "xml_file", help="Jalc RDF XML file (record-per-line) to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_jalc.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_jalc.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_jalc.add_argument('--extid-map-file', + default=None, + type=argparse.FileType("r"), + ) + sub_jalc.add_argument( + "--extid-map-file", help="DOI-to-other-identifiers sqlite3 database", - default=None, type=str) + default=None, + type=str, + ) - sub_arxiv = subparsers.add_parser('arxiv', - help="import arxiv.org metadata from XML files") + sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files") sub_arxiv.set_defaults( func=run_arxiv, auth_var="FATCAT_AUTH_WORKER_ARXIV", ) - sub_arxiv.add_argument('xml_file', - nargs='?', + sub_arxiv.add_argument( + "xml_file", + nargs="?", help="arXivRaw XML file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_arxiv.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_arxiv.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_pubmed = subparsers.add_parser('pubmed', - help="import MEDLINE/PubMed work-level metadata (XML)") + sub_pubmed = subparsers.add_parser( + "pubmed", help="import MEDLINE/PubMed work-level metadata (XML)" + ) sub_pubmed.set_defaults( func=run_pubmed, auth_var="FATCAT_AUTH_WORKER_PUBMED", ) - sub_pubmed.add_argument('xml_file', - nargs='?', + sub_pubmed.add_argument( + "xml_file", + nargs="?", help="Pubmed XML file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_pubmed.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_pubmed.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - 
default=None, type=argparse.FileType('r')) - sub_pubmed.add_argument('--no-lookup-refs', - action='store_true', - help="skip lookup of references (PMID or DOI)") - sub_pubmed.add_argument('--do-updates', - action='store_true', - help="update pre-existing release entities") - sub_pubmed.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - - sub_jstor = subparsers.add_parser('jstor', - help="import JSTOR work-level metadata from XML dump") + default=None, + type=argparse.FileType("r"), + ) + sub_pubmed.add_argument( + "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" + ) + sub_pubmed.add_argument( + "--do-updates", action="store_true", help="update pre-existing release entities" + ) + sub_pubmed.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + + sub_jstor = subparsers.add_parser( + "jstor", help="import JSTOR work-level metadata from XML dump" + ) sub_jstor.set_defaults( func=run_jstor, auth_var="FATCAT_AUTH_WORKER_JSTOR", ) - sub_jstor.add_argument('list_file', + sub_jstor.add_argument( + "list_file", help="List of JSTOR XML file paths to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_jstor.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_jstor.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) + default=None, + type=argparse.FileType("r"), + ) - sub_orcid = subparsers.add_parser('orcid', - help="import creator entities from ORCID XML dump") - sub_orcid.set_defaults( - func=run_orcid, - auth_var="FATCAT_AUTH_WORKER_ORCID" + sub_orcid = subparsers.add_parser( + "orcid", help="import creator entities from ORCID XML dump" ) - sub_orcid.add_argument('json_file', + sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID") + sub_orcid.add_argument( + "json_file", help="orcid JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_journal_metadata = subparsers.add_parser('journal-metadata', - help="import/update container metadata from old manual munging format") + sub_journal_metadata = subparsers.add_parser( + "journal-metadata", + help="import/update container metadata from old manual munging format", + ) sub_journal_metadata.set_defaults( func=run_journal_metadata, auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_journal_metadata.add_argument('json_file', + sub_journal_metadata.add_argument( + "json_file", help="Journal JSON metadata file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_chocula = subparsers.add_parser('chocula', - help="import/update container metadata from chocula JSON export") + sub_chocula = subparsers.add_parser( + "chocula", help="import/update container metadata from chocula JSON export" + ) sub_chocula.set_defaults( func=run_chocula, auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_chocula.add_argument('json_file', + sub_chocula.add_argument( + "json_file", help="chocula JSON entities file (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_chocula.add_argument('--do-updates', - action='store_true', - help="update pre-existing container entities") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_chocula.add_argument( + "--do-updates", action="store_true", 
help="update pre-existing container entities" + ) - sub_matched = subparsers.add_parser('matched', - help="add file entities matched against existing releases; custom JSON format") + sub_matched = subparsers.add_parser( + "matched", + help="add file entities matched against existing releases; custom JSON format", + ) sub_matched.set_defaults( func=run_matched, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_matched.add_argument('json_file', + sub_matched.add_argument( + "json_file", help="JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_matched.add_argument('--default-mimetype', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_matched.add_argument( + "--default-mimetype", default=None, - help="default mimetype for imported files (if not specified per-file)") - sub_matched.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") - sub_matched.add_argument('--default-link-rel', + help="default mimetype for imported files (if not specified per-file)", + ) + sub_matched.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)", + ) + sub_matched.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_arabesque_match = subparsers.add_parser('arabesque', - help="add file entities matched to releases from crawl log analysis") + sub_arabesque_match = subparsers.add_parser( + "arabesque", help="add file entities matched to releases from crawl log analysis" + ) sub_arabesque_match.set_defaults( func=run_arabesque_match, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_arabesque_match.add_argument('--sqlite-file', - help="sqlite database file to import from") - sub_arabesque_match.add_argument('--json-file', - help="JSON file to import from (or stdin)", - type=argparse.FileType('r')) - sub_arabesque_match.add_argument('--do-updates', - action='store_true', - help="update pre-existing file entities if new match (instead of skipping)") - sub_arabesque_match.add_argument('--no-require-grobid', - action='store_true', - help="whether postproc_status column must be '200'") - sub_arabesque_match.add_argument('--extid-type', + sub_arabesque_match.add_argument( + "--sqlite-file", help="sqlite database file to import from" + ) + sub_arabesque_match.add_argument( + "--json-file", help="JSON file to import from (or stdin)", type=argparse.FileType("r") + ) + sub_arabesque_match.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing file entities if new match (instead of skipping)", + ) + sub_arabesque_match.add_argument( + "--no-require-grobid", + action="store_true", + help="whether postproc_status column must be '200'", + ) + sub_arabesque_match.add_argument( + "--extid-type", default="doi", - help="identifier type in the database (eg, 'doi', 'pmcid'") - sub_arabesque_match.add_argument('--crawl-id', - help="crawl ID (optionally included in editgroup metadata)") - sub_arabesque_match.add_argument('--default-link-rel', + help="identifier type in the database (eg, 'doi', 'pmcid'", + ) + sub_arabesque_match.add_argument( + "--crawl-id", help="crawl ID (optionally included in editgroup metadata)" + ) + sub_arabesque_match.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + 
help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_ingest_file = subparsers.add_parser('ingest-file-results', - help="add/update file entities linked to releases based on sandcrawler ingest results") + sub_ingest_file = subparsers.add_parser( + "ingest-file-results", + help="add/update file entities linked to releases based on sandcrawler ingest results", + ) sub_ingest_file.set_defaults( func=run_ingest_file, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_ingest_file.add_argument('json_file', + sub_ingest_file.add_argument( + "json_file", help="ingest_file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_file.add_argument('--skip-source-allowlist', - action='store_true', - help="don't filter import based on request source allowlist") - sub_ingest_file.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_ingest_file.add_argument('--do-updates', - action='store_true', - help="update pre-existing file entities if new match (instead of skipping)") - sub_ingest_file.add_argument('--no-require-grobid', - action='store_true', - help="whether postproc_status column must be '200'") - sub_ingest_file.add_argument('--default-link-rel', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_ingest_file.add_argument( + "--skip-source-allowlist", + action="store_true", + help="don't filter import based on request source allowlist", + ) + sub_ingest_file.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_ingest_file.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing file entities if new match (instead of skipping)", + ) + sub_ingest_file.add_argument( + "--no-require-grobid", + action="store_true", + help="whether postproc_status column must be '200'", + ) + sub_ingest_file.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_ingest_web = subparsers.add_parser('ingest-web-results', - help="add/update web entities linked to releases based on sandcrawler ingest results") + sub_ingest_web = subparsers.add_parser( + "ingest-web-results", + help="add/update web entities linked to releases based on sandcrawler ingest results", + ) sub_ingest_web.set_defaults( func=run_ingest_web, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_ingest_web.add_argument('json_file', + sub_ingest_web.add_argument( + "json_file", help="ingest_web JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_web.add_argument('--skip-source-allowlist', - action='store_true', - help="don't filter import based on request source allowlist") - sub_ingest_web.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_ingest_web.add_argument('--do-updates', - action='store_true', - help="update pre-existing web entities if new match (instead of skipping)") - sub_ingest_web.add_argument('--default-link-rel', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_ingest_web.add_argument( + "--skip-source-allowlist", + action="store_true", + help="don't filter import based on request source allowlist", + ) + sub_ingest_web.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_ingest_web.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing web 
entities if new match (instead of skipping)", + ) + sub_ingest_web.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_ingest_fileset = subparsers.add_parser('ingest-fileset-results', - help="add/update fileset entities linked to releases based on sandcrawler ingest results") + sub_ingest_fileset = subparsers.add_parser( + "ingest-fileset-results", + help="add/update fileset entities linked to releases based on sandcrawler ingest results", + ) sub_ingest_fileset.set_defaults( func=run_ingest_fileset, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_ingest_fileset.add_argument('json_file', + sub_ingest_fileset.add_argument( + "json_file", help="ingest_fileset JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_fileset.add_argument('--skip-source-allowlist', - action='store_true', - help="don't filter import based on request source allowlist") - sub_ingest_fileset.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_ingest_fileset.add_argument('--do-updates', - action='store_true', - help="update pre-existing fileset entities if new match (instead of skipping)") - sub_ingest_fileset.add_argument('--default-link-rel', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_ingest_fileset.add_argument( + "--skip-source-allowlist", + action="store_true", + help="don't filter import based on request source allowlist", + ) + sub_ingest_fileset.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_ingest_fileset.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing fileset entities if new match (instead of skipping)", + ) + sub_ingest_fileset.add_argument( + "--default-link-rel", default="fileset", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results', - help="add file entities crawled due to async Save Paper Now request") + sub_savepapernow_file = subparsers.add_parser( + "savepapernow-file-results", + help="add file entities crawled due to async Save Paper Now request", + ) sub_savepapernow_file.set_defaults( func=run_savepapernow_file, auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW", ) - sub_savepapernow_file.add_argument('json_file', + sub_savepapernow_file.add_argument( + "json_file", help="ingest-file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_savepapernow_file.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_savepapernow_file.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_savepapernow_web = subparsers.add_parser('savepapernow-web-results', - help="add webcapture entities crawled due to async Save Paper Now request") + sub_savepapernow_web = subparsers.add_parser( + "savepapernow-web-results", + help="add webcapture entities crawled due to async Save Paper Now request", + ) sub_savepapernow_web.set_defaults( func=run_savepapernow_web, auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW", ) - sub_savepapernow_web.add_argument('json_file', + sub_savepapernow_web.add_argument( + "json_file", help="ingest-file JSON file to import from", - 
default=sys.stdin, type=argparse.FileType('r')) - sub_savepapernow_web.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_savepapernow_web.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_savepapernow_fileset = subparsers.add_parser('savepapernow-fileset-results', - help="add fileset entities crawled due to async Save Paper Now request") + sub_savepapernow_fileset = subparsers.add_parser( + "savepapernow-fileset-results", + help="add fileset entities crawled due to async Save Paper Now request", + ) sub_savepapernow_fileset.set_defaults( func=run_savepapernow_fileset, auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW", ) - sub_savepapernow_fileset.add_argument('json_file', + sub_savepapernow_fileset.add_argument( + "json_file", help="ingest-file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_savepapernow_fileset.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_savepapernow_fileset.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_grobid_metadata = subparsers.add_parser('grobid-metadata', - help="create release and file entities based on GROBID PDF metadata extraction") + sub_grobid_metadata = subparsers.add_parser( + "grobid-metadata", + help="create release and file entities based on GROBID PDF metadata extraction", + ) sub_grobid_metadata.set_defaults( func=run_grobid_metadata, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_grobid_metadata.add_argument('tsv_file', + sub_grobid_metadata.add_argument( + "tsv_file", help="TSV file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_grobid_metadata.add_argument('--group-size', - help="editgroup group size to use", - default=75, type=int) - sub_grobid_metadata.add_argument('--longtail-oa', - action='store_true', - help="if this is an import of longtail OA content (sets an 'extra' flag)") - sub_grobid_metadata.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") - - sub_shadow_lib = subparsers.add_parser('shadow-lib', - help="create release and file entities based on GROBID PDF metadata extraction") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_grobid_metadata.add_argument( + "--group-size", help="editgroup group size to use", default=75, type=int + ) + sub_grobid_metadata.add_argument( + "--longtail-oa", + action="store_true", + help="if this is an import of longtail OA content (sets an 'extra' flag)", + ) + sub_grobid_metadata.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)", + ) + + sub_shadow_lib = subparsers.add_parser( + "shadow-lib", + help="create release and file entities based on GROBID PDF metadata extraction", + ) sub_shadow_lib.set_defaults( func=run_shadow_lib, auth_var="FATCAT_AUTH_WORKER_SHADOW", ) - sub_shadow_lib.add_argument('json_file', + sub_shadow_lib.add_argument( + "json_file", help="JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_wayback_static = subparsers.add_parser('wayback-static', - help="crude crawl+ingest tool for single-page HTML docs 
from wayback") + sub_wayback_static = subparsers.add_parser( + "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback" + ) sub_wayback_static.set_defaults( func=run_wayback_static, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_wayback_static.add_argument('wayback_url', - type=str, - help="URL of wayback capture to extract from") - sub_wayback_static.add_argument('--extid', - type=str, - help="external identifier for release lookup") - sub_wayback_static.add_argument('--release-id', - type=str, - help="release entity identifier") - sub_wayback_static.add_argument('--editgroup-id', + sub_wayback_static.add_argument( + "wayback_url", type=str, help="URL of wayback capture to extract from" + ) + sub_wayback_static.add_argument( + "--extid", type=str, help="external identifier for release lookup" + ) + sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier") + sub_wayback_static.add_argument( + "--editgroup-id", type=str, - help="use existing editgroup (instead of creating a new one)") + help="use existing editgroup (instead of creating a new one)", + ) - sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat', - help="crude helper to import datasets from Dat/CDL mirror pilot project") + sub_cdl_dash_dat = subparsers.add_parser( + "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project" + ) sub_cdl_dash_dat.set_defaults( func=run_cdl_dash_dat, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_cdl_dash_dat.add_argument('dat_path', - type=str, - help="local path dat to import (must be the dat discovery key)") - sub_cdl_dash_dat.add_argument('--release-id', - type=str, - help="release entity identifier") - sub_cdl_dash_dat.add_argument('--editgroup-id', + sub_cdl_dash_dat.add_argument( + "dat_path", type=str, help="local path dat to import (must be the dat discovery key)" + ) + sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier") + sub_cdl_dash_dat.add_argument( + "--editgroup-id", type=str, - help="use existing editgroup (instead of creating a new one)") + help="use existing editgroup (instead of creating a new one)", + ) - sub_datacite = subparsers.add_parser('datacite', - help="import datacite.org metadata") - sub_datacite.add_argument('json_file', + sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata") + sub_datacite.add_argument( + "json_file", help="File with jsonlines from datacite.org v2 API to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_datacite.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_datacite.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_datacite.add_argument('--extid-map-file', + default=None, + type=argparse.FileType("r"), + ) + sub_datacite.add_argument( + "--extid-map-file", help="DOI-to-other-identifiers sqlite3 database", - default=None, type=str) - sub_datacite.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_datacite.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") - sub_datacite.add_argument('--debug', - action='store_true', - help="write converted JSON to stdout") - sub_datacite.add_argument('--insert-log-file', - default='', + default=None, type=str, - help="write inserted documents into file (for debugging)") + ) + 
sub_datacite.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_datacite.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)", + ) + sub_datacite.add_argument( + "--debug", action="store_true", help="write converted JSON to stdout" + ) + sub_datacite.add_argument( + "--insert-log-file", + default="", + type=str, + help="write inserted documents into file (for debugging)", + ) sub_datacite.set_defaults( func=run_datacite, auth_var="FATCAT_AUTH_WORKER_DATACITE", ) - sub_doaj_article = subparsers.add_parser('doaj-article', - help="import doaj.org article metadata") - sub_doaj_article.add_argument('json_file', + sub_doaj_article = subparsers.add_parser( + "doaj-article", help="import doaj.org article metadata" + ) + sub_doaj_article.add_argument( + "json_file", help="File with JSON lines from DOAJ API (or bulk dump) to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_doaj_article.add_argument('--issn-map-file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_doaj_article.add_argument( + "--issn-map-file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_doaj_article.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_doaj_article.add_argument('--do-updates', - action='store_true', - help="update any pre-existing release entities") + default=None, + type=argparse.FileType("r"), + ) + sub_doaj_article.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_doaj_article.add_argument( + "--do-updates", action="store_true", help="update any pre-existing release entities" + ) sub_doaj_article.set_defaults( func=run_doaj_article, auth_var="FATCAT_AUTH_WORKER_DOAJ", ) - sub_dblp_release = subparsers.add_parser('dblp-release', - help="import dblp release metadata") - sub_dblp_release.add_argument('xml_file', + sub_dblp_release = subparsers.add_parser( + "dblp-release", help="import dblp release metadata" + ) + sub_dblp_release.add_argument( + "xml_file", help="File with DBLP XML to import from", - default=sys.stdin, type=argparse.FileType('rb')) - sub_dblp_release.add_argument('--dblp-container-map-file', + default=sys.stdin, + type=argparse.FileType("rb"), + ) + sub_dblp_release.add_argument( + "--dblp-container-map-file", help="file path to dblp prefix to container_id TSV file", - default=None, type=argparse.FileType('r')) - sub_dblp_release.add_argument('--do-updates', - action='store_true', - help="update any pre-existing release entities") - sub_dblp_release.add_argument('--dump-json-mode', - action='store_true', - help="print release entities to stdout instead of importing") + default=None, + type=argparse.FileType("r"), + ) + sub_dblp_release.add_argument( + "--do-updates", action="store_true", help="update any pre-existing release entities" + ) + sub_dblp_release.add_argument( + "--dump-json-mode", + action="store_true", + help="print release entities to stdout instead of importing", + ) sub_dblp_release.set_defaults( func=run_dblp_release, auth_var="FATCAT_AUTH_WORKER_DBLP", ) - sub_dblp_container = subparsers.add_parser('dblp-container', - help="import dblp container metadata") - sub_dblp_container.add_argument('json_file', + sub_dblp_container = subparsers.add_parser( + "dblp-container", help="import dblp container metadata" + ) + sub_dblp_container.add_argument( + 
"json_file", help="File with DBLP container JSON to import from (see extra/dblp/)", - default=sys.stdin, type=argparse.FileType('rb')) - sub_dblp_container.add_argument('--dblp-container-map-file', + default=sys.stdin, + type=argparse.FileType("rb"), + ) + sub_dblp_container.add_argument( + "--dblp-container-map-file", help="file path to dblp pre-existing prefix to container_id TSV file", - default=None, type=argparse.FileType('r')) - sub_dblp_container.add_argument('--dblp-container-map-output', + default=None, + type=argparse.FileType("r"), + ) + sub_dblp_container.add_argument( + "--dblp-container-map-output", help="file path to output new dblp container map TSV to", - default=None, type=argparse.FileType('w')) - sub_dblp_container.add_argument('--issn-map-file', + default=None, + type=argparse.FileType("w"), + ) + sub_dblp_container.add_argument( + "--issn-map-file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_dblp_container.add_argument('--do-updates', - action='store_true', - help="update any pre-existing container entities") + default=None, + type=argparse.FileType("r"), + ) + sub_dblp_container.add_argument( + "--do-updates", action="store_true", help="update any pre-existing container entities" + ) sub_dblp_container.set_defaults( func=run_dblp_container, auth_var="FATCAT_AUTH_WORKER_DBLP", ) - sub_file_meta = subparsers.add_parser('file-meta', - help="simple update-only importer for file metadata") + sub_file_meta = subparsers.add_parser( + "file-meta", help="simple update-only importer for file metadata" + ) sub_file_meta.set_defaults( func=run_file_meta, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_file_meta.add_argument('json_file', + sub_file_meta.add_argument( + "json_file", help="File with jsonlines from file_meta schema to import from", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_fileset = subparsers.add_parser('fileset', - help="generic fileset importer") + sub_fileset = subparsers.add_parser("fileset", help="generic fileset importer") sub_fileset.set_defaults( func=run_fileset, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_fileset.add_argument('json_file', + sub_fileset.add_argument( + "json_file", help="File with jsonlines of fileset entities to import", - default=sys.stdin, type=argparse.FileType('r')) - sub_fileset.add_argument('--skip-release-fileset-check', - action='store_true', - help="create without checking if releases already have related filesets") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_fileset.add_argument( + "--skip-release-fileset-check", + action="store_true", + help="create without checking if releases already have related filesets", + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -889,15 +1118,18 @@ def main(): # allow editgroup description override via env variable (but CLI arg takes # precedence) - if not args.editgroup_description_override \ - and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'): - args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION') + if not args.editgroup_description_override and os.environ.get( + "FATCAT_EDITGROUP_DESCRIPTION" + ): + args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION") args.api = authenticated_api( args.host_url, # token is an optional kwarg (can be empty string, None, etc) - token=os.environ.get(args.auth_var)) + token=os.environ.get(args.auth_var), + ) args.func(args) -if __name__ == '__main__': + +if 
__name__ == "__main__": main() diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 165e42f3..21597fae 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -42,51 +42,56 @@ def _run_search_dump(args, search): else: ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env) if args.enqueue_kafka: - print("Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), file=sys.stderr) + print( + "Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), + file=sys.stderr, + ) kafka_producer = simple_kafka_producer(args.kafka_hosts) if args.limit is not None: - search = search[:args.limit] + search = search[: args.limit] if args.before_year: - search = search \ - .filter("exists", field="release_year") \ - .filter("range", release_date=dict(lt=args.before_year)) + search = search.filter("exists", field="release_year").filter( + "range", release_date=dict(lt=args.before_year) + ) if args.after_year: - search = search \ - .filter("exists", field="release_year") \ - .filter("range", release_date=dict(gte=args.after_year)) + search = search.filter("exists", field="release_year").filter( + "range", release_date=dict(gte=args.after_year) + ) if not args.allow_non_oa: search = search.filter("term", is_oa=True) if args.release_types: - release_types = args.release_types.split(',') - search = search \ - .filter("terms", release_type=release_types) + release_types = args.release_types.split(",") + search = search.filter("terms", release_type=release_types) else: - search = search \ - .filter("bool", must_not=[ - Q("terms", release_type=["stub", "component"]) - ]) + search = search.filter( + "bool", must_not=[Q("terms", release_type=["stub", "component"])] + ) - counts = Counter({'ingest_request': 0, 'elasticsearch_release': 0, 'estimate': 0}) + counts = Counter({"ingest_request": 0, "elasticsearch_release": 0, "estimate": 0}) search = search.params() - counts['estimate'] = search.count() - print("Expecting {} release objects in search queries".format(counts['estimate']), file=sys.stderr) + counts["estimate"] = search.count() + print( + "Expecting {} release objects in search queries".format(counts["estimate"]), + file=sys.stderr, + ) # don't try to clean up scroll if we are connected to public server (behind # nginx proxy that disallows DELETE) if args.elasticsearch_endpoint in ( - 'https://search.fatcat.wiki', - 'https://search.qa.fatcat.wiki'): + "https://search.fatcat.wiki", + "https://search.qa.fatcat.wiki", + ): search = search.params(clear_scroll=False) results = search.scan() for esr in results: - if args.limit and counts['ingest_request'] >= args.limit: + if args.limit and counts["ingest_request"] >= args.limit: break - counts['elasticsearch_release'] += 1 + counts["elasticsearch_release"] += 1 release = args.api.get_release(esr.ident) ingest_request = release_ingest_request( release, @@ -96,18 +101,18 @@ def _run_search_dump(args, search): if not ingest_request: continue if args.force_recrawl: - ingest_request['force_recrawl'] = True - counts['ingest_request'] += 1 + ingest_request["force_recrawl"] = True + counts["ingest_request"] += 1 if args.dry_run: continue if kafka_producer is not None: kafka_producer.produce( ingest_file_request_topic, - json.dumps(ingest_request).encode('utf-8'), - #key=None, + json.dumps(ingest_request).encode("utf-8"), + # key=None, on_delivery=kafka_fail_fast, ) - counts['kafka'] += 1 + counts["kafka"] += 1 else: print(json.dumps(ingest_request)) if kafka_producer 
is not None: @@ -138,7 +143,9 @@ def run_ingest_container(args): elif args.name: search = search.query("match", container_name=args.name) else: - print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr) + print( + "You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr + ) sys.exit(-1) return _run_search_dump(args, search) @@ -150,8 +157,9 @@ def run_ingest_query(args): way as searches in the fatcat web interface. """ - search = _init_search(args) \ - .filter("term", in_ia=False) \ + search = ( + _init_search(args) + .filter("term", in_ia=False) .query( "query_string", query=args.query, @@ -160,6 +168,7 @@ def run_ingest_query(args): lenient=True, fields=["title^5", "contrib_names^2", "container_title"], ) + ) return _run_search_dump(args, search) @@ -169,86 +178,96 @@ def run_ingest_extid(args): Selects release entities where the external identifier (extid) exists """ - search = _init_search(args) \ - .filter("term", in_ia=False) \ - .filter("exists", field=args.extid) + search = _init_search(args).filter("term", in_ia=False).filter("exists", field=args.extid) return _run_search_dump(args, search) def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--fatcat-api-url', - default="http://localhost:9411/v0", - help="connect to this host/port") - parser.add_argument('--enqueue-kafka', - action='store_true', - help="send ingest requests directly to sandcrawler kafka topic for processing") - parser.add_argument('--kafka-hosts', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument( + "--enqueue-kafka", + action="store_true", + help="send ingest requests directly to sandcrawler kafka topic for processing", + ) + parser.add_argument( + "--kafka-hosts", default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-request-topic', - help="exact Kafka ingest request topic to use") - parser.add_argument('--elasticsearch-endpoint', + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument("--kafka-request-topic", help="exact Kafka ingest request topic to use") + parser.add_argument( + "--elasticsearch-endpoint", default="https://search.fatcat.wiki", - help="elasticsearch API. internal endpoint preferred, but public is default") - parser.add_argument('--elasticsearch-index', - default="fatcat_release", - help="elasticsearch index to query") - parser.add_argument('--env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('--limit', - default=None, - type=int, - help="Max number of search hits to return") - parser.add_argument('--dry-run', - action='store_true', - help="runs through creating all ingest requests, but doesn't actually output or enqueue") - parser.add_argument('--before-year', + help="elasticsearch API. 
internal endpoint preferred, but public is default", + ) + parser.add_argument( + "--elasticsearch-index", default="fatcat_release", help="elasticsearch index to query" + ) + parser.add_argument( + "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) + parser.add_argument( + "--limit", default=None, type=int, help="Max number of search hits to return" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="runs through creating all ingest requests, but doesn't actually output or enqueue", + ) + parser.add_argument( + "--before-year", type=str, - help="filters results to only with release_year before this (not inclusive)") - parser.add_argument('--after-year', + help="filters results to only with release_year before this (not inclusive)", + ) + parser.add_argument( + "--after-year", type=str, - help="filters results to only with release_year after this (inclusive)") - parser.add_argument('--release-types', + help="filters results to only with release_year after this (inclusive)", + ) + parser.add_argument( + "--release-types", type=str, - help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.") - parser.add_argument('--allow-non-oa', - action='store_true', - help="By default, we limit to OA releases. This removes that filter") - parser.add_argument('--force-recrawl', - action='store_true', - help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl") - parser.add_argument('--ingest-type', - default="pdf", - help="What medium to ingest (pdf, xml, html)") + help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.", + ) + parser.add_argument( + "--allow-non-oa", + action="store_true", + help="By default, we limit to OA releases. 
This removes that filter", + ) + parser.add_argument( + "--force-recrawl", + action="store_true", + help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl", + ) + parser.add_argument( + "--ingest-type", default="pdf", help="What medium to ingest (pdf, xml, html)" + ) subparsers = parser.add_subparsers() - sub_container = subparsers.add_parser('container', - help="Create ingest requests for releases from a specific container") + sub_container = subparsers.add_parser( + "container", help="Create ingest requests for releases from a specific container" + ) sub_container.set_defaults(func=run_ingest_container) - sub_container.add_argument('--container-id', - help="fatcat container entity ident") - sub_container.add_argument('--issnl', - help="ISSN-L of container entity") - sub_container.add_argument('--publisher', - help="publisher name") - sub_container.add_argument('--name', - help="container name") - - sub_query = subparsers.add_parser('query', - help="Create ingest requests for releases from a specific query") + sub_container.add_argument("--container-id", help="fatcat container entity ident") + sub_container.add_argument("--issnl", help="ISSN-L of container entity") + sub_container.add_argument("--publisher", help="publisher name") + sub_container.add_argument("--name", help="container name") + + sub_query = subparsers.add_parser( + "query", help="Create ingest requests for releases from a specific query" + ) sub_query.set_defaults(func=run_ingest_query) - sub_query.add_argument('query', - help="search query (same DSL as web interface search)") + sub_query.add_argument("query", help="search query (same DSL as web interface search)") - sub_extid = subparsers.add_parser('extid', - help="Create ingest requests for releases that have given extid defined") + sub_extid = subparsers.add_parser( + "extid", help="Create ingest requests for releases that have given extid defined" + ) sub_extid.set_defaults(func=run_ingest_extid) - sub_extid.add_argument('extid', - help="extid short name (as included in ES release schema)") + sub_extid.add_argument("extid", help="extid short name (as included in ES release schema)") args = parser.parse_args() if not args.__dict__.get("func"): @@ -258,5 +277,6 @@ def main(): args.api = public_api(args.fatcat_api_url) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_review.py b/python/fatcat_review.py index f719bb46..0cdfc29d 100755 --- a/python/fatcat_review.py +++ b/python/fatcat_review.py @@ -13,39 +13,43 @@ sentry_client = raven.Client() def run_dummy(args): - reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval, - verbose=args.verbose) + reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval, verbose=args.verbose) if args.editgroup: annotation = reviewer.run_single(args.editgroup, args.annotate) print(annotation) elif args.continuous: reviewer.run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--verbose', - action='store_true', - help="enable verbose output") - parser.add_argument('--fatcat-api-url', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--verbose", action="store_true", help="enable verbose output") + parser.add_argument( + "--fatcat-api-url", default="http://localhost:9411/v0", - help="fatcat API host/port to use") - parser.add_argument('--poll-interval', + help="fatcat API host/port to use", + ) + 
parser.add_argument( + "--poll-interval", help="how long to wait between polling (seconds)", - default=10.0, type=float) + default=10.0, + type=float, + ) subparsers = parser.add_subparsers() - sub_dummy = subparsers.add_parser('dummy', - help="example/demonstration review bot") + sub_dummy = subparsers.add_parser("dummy", help="example/demonstration review bot") sub_dummy.set_defaults(func=run_dummy) - sub_dummy.add_argument("--continuous", + sub_dummy.add_argument( + "--continuous", action="store_true", - help="run forever, polling for new reviewable editgroups") - sub_dummy.add_argument("--editgroup", - help="single editgroup ID to review") - sub_dummy.add_argument("--annotate", + help="run forever, polling for new reviewable editgroups", + ) + sub_dummy.add_argument("--editgroup", help="single editgroup ID to review") + sub_dummy.add_argument( + "--annotate", action="store_true", - help="for single editgroups, pushes result as annotation") + help="for single editgroups, pushes result as annotation", + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -58,5 +62,6 @@ def main(): args.api = authenticated_api(args.fatcat_api_url) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index ab855dbf..fe2e12a6 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -31,10 +31,10 @@ def run_elasticsearch_releases(args): if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) - if entity.state != 'active': + if entity.state != "active": continue - args.json_output.write( - json.dumps(release_to_elasticsearch(entity)) + '\n') + args.json_output.write(json.dumps(release_to_elasticsearch(entity)) + "\n") + def run_elasticsearch_containers(args): es_client = elasticsearch.Elasticsearch(args.fatcat_elasticsearch_url) @@ -44,7 +44,7 @@ def run_elasticsearch_containers(args): if not line: continue entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) - if entity.state != 'active': + if entity.state != "active": continue if args.query_stats: @@ -60,7 +60,8 @@ def run_elasticsearch_containers(args): else: es_doc = container_to_elasticsearch(entity) - args.json_output.write(json.dumps(es_doc) + '\n') + args.json_output.write(json.dumps(es_doc) + "\n") + def run_elasticsearch_files(args): for line in args.json_input: @@ -68,10 +69,10 @@ def run_elasticsearch_files(args): if not line: continue entity = entity_from_json(line, FileEntity, api_client=args.api.api_client) - if entity.state != 'active': + if entity.state != "active": continue - args.json_output.write( - json.dumps(file_to_elasticsearch(entity)) + '\n') + args.json_output.write(json.dumps(file_to_elasticsearch(entity)) + "\n") + def run_elasticsearch_changelogs(args): for line in args.json_input: @@ -79,8 +80,8 @@ def run_elasticsearch_changelogs(args): if not line: continue entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) - args.json_output.write( - json.dumps(changelog_to_elasticsearch(entity)) + '\n') + args.json_output.write(json.dumps(changelog_to_elasticsearch(entity)) + "\n") + def run_citeproc_releases(args): for line in args.json_input: @@ -88,82 +89,126 @@ def run_citeproc_releases(args): if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) - if entity.state != 'active': + if entity.state != "active": continue csl_json = release_to_csl(entity) - 
csl_json['id'] = "release:" + (entity.ident or "unknown") + csl_json["id"] = "release:" + (entity.ident or "unknown") out = citeproc_csl(csl_json, args.style, args.html) args.json_output.write(out + "\n") + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--fatcat-api-url', - default="http://localhost:9411/v0", - help="connect to this host/port") - parser.add_argument('--fatcat-elasticsearch-url', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument( + "--fatcat-elasticsearch-url", default="http://localhost:9200", - help="connect to this host/port") + help="connect to this host/port", + ) subparsers = parser.add_subparsers() - sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases', - help="convert fatcat release JSON schema to elasticsearch release schema") + sub_elasticsearch_releases = subparsers.add_parser( + "elasticsearch-releases", + help="convert fatcat release JSON schema to elasticsearch release schema", + ) sub_elasticsearch_releases.set_defaults(func=run_elasticsearch_releases) - sub_elasticsearch_releases.add_argument('json_input', + sub_elasticsearch_releases.add_argument( + "json_input", help="JSON-per-line of release entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_elasticsearch_releases.add_argument('json_output', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_elasticsearch_releases.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - - sub_elasticsearch_containers = subparsers.add_parser('elasticsearch-containers', - help="convert fatcat container JSON schema to elasticsearch container schema") + default=sys.stdout, + type=argparse.FileType("w"), + ) + + sub_elasticsearch_containers = subparsers.add_parser( + "elasticsearch-containers", + help="convert fatcat container JSON schema to elasticsearch container schema", + ) sub_elasticsearch_containers.set_defaults(func=run_elasticsearch_containers) - sub_elasticsearch_containers.add_argument('json_input', + sub_elasticsearch_containers.add_argument( + "json_input", help="JSON-per-line of container entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_elasticsearch_containers.add_argument('json_output', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_elasticsearch_containers.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - sub_elasticsearch_containers.add_argument('--query-stats', - action='store_true', - help="whether to query release search index for container stats") - - sub_elasticsearch_files = subparsers.add_parser('elasticsearch-files', - help="convert fatcat file JSON schema to elasticsearch file schema") + default=sys.stdout, + type=argparse.FileType("w"), + ) + sub_elasticsearch_containers.add_argument( + "--query-stats", + action="store_true", + help="whether to query release search index for container stats", + ) + + sub_elasticsearch_files = subparsers.add_parser( + "elasticsearch-files", + help="convert fatcat file JSON schema to elasticsearch file schema", + ) sub_elasticsearch_files.set_defaults(func=run_elasticsearch_files) - sub_elasticsearch_files.add_argument('json_input', + sub_elasticsearch_files.add_argument( + "json_input", help="JSON-per-line of 
file entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_elasticsearch_files.add_argument('json_output', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_elasticsearch_files.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - - sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs', - help="convert fatcat changelog JSON schema to elasticsearch changelog schema") + default=sys.stdout, + type=argparse.FileType("w"), + ) + + sub_elasticsearch_changelogs = subparsers.add_parser( + "elasticsearch-changelogs", + help="convert fatcat changelog JSON schema to elasticsearch changelog schema", + ) sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs) - sub_elasticsearch_changelogs.add_argument('json_input', + sub_elasticsearch_changelogs.add_argument( + "json_input", help="JSON-per-line of changelog entries", - default=sys.stdin, type=argparse.FileType('r')) - sub_elasticsearch_changelogs.add_argument('json_output', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_elasticsearch_changelogs.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - - sub_citeproc_releases = subparsers.add_parser('citeproc-releases', - help="convert fatcat release schema to any standard citation format using citeproc/CSL") + default=sys.stdout, + type=argparse.FileType("w"), + ) + + sub_citeproc_releases = subparsers.add_parser( + "citeproc-releases", + help="convert fatcat release schema to any standard citation format using citeproc/CSL", + ) sub_citeproc_releases.set_defaults(func=run_citeproc_releases) - sub_citeproc_releases.add_argument('json_input', + sub_citeproc_releases.add_argument( + "json_input", help="JSON-per-line of release entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_citeproc_releases.add_argument('json_output', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_citeproc_releases.add_argument( + "json_output", help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - sub_citeproc_releases.add_argument('--style', - help="citation style to output", - default='csl-json') - sub_citeproc_releases.add_argument('--html', - action='store_true', - help="output HTML, not plain text") + default=sys.stdout, + type=argparse.FileType("w"), + ) + sub_citeproc_releases.add_argument( + "--style", help="citation style to output", default="csl-json" + ) + sub_citeproc_releases.add_argument( + "--html", action="store_true", help="output HTML, not plain text" + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -173,5 +218,6 @@ def main(): args.api = public_api(args.fatcat_api_url) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_util.py b/python/fatcat_util.py index a8e99ac3..57102e9e 100755 --- a/python/fatcat_util.py +++ b/python/fatcat_util.py @@ -16,47 +16,50 @@ from fatcat_tools import authenticated_api, fcid2uuid, uuid2fcid def run_uuid2fcid(args): print(uuid2fcid(args.uuid)) + def run_fcid2uuid(args): print(fcid2uuid(args.fcid)) + def run_editgroup_accept(args): args.api.accept_editgroup(args.editgroup_id) + def run_editgroup_submit(args): eg = args.api.get_editgroup(args.editgroup_id) args.api.update_editgroup(args.editgroup_id, eg, submit=True) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - 
parser.add_argument('--fatcat-api-url', - default="http://localhost:9411/v0", - help="connect to this host/port") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) subparsers = parser.add_subparsers() - sub_uuid2fcid = subparsers.add_parser('uuid2fcid', - help="convert a standard UUID (as string) to fatcat ident format") + sub_uuid2fcid = subparsers.add_parser( + "uuid2fcid", help="convert a standard UUID (as string) to fatcat ident format" + ) sub_uuid2fcid.set_defaults(func=run_uuid2fcid) - sub_uuid2fcid.add_argument('uuid', - help="UUID to transform") + sub_uuid2fcid.add_argument("uuid", help="UUID to transform") - sub_fcid2uuid = subparsers.add_parser('fcid2uuid', - help="convert a fatcat ident string to standard UUID format") + sub_fcid2uuid = subparsers.add_parser( + "fcid2uuid", help="convert a fatcat ident string to standard UUID format" + ) sub_fcid2uuid.set_defaults(func=run_fcid2uuid) - sub_fcid2uuid.add_argument('fcid', - help="FCID to transform (into UUID)") + sub_fcid2uuid.add_argument("fcid", help="FCID to transform (into UUID)") - sub_editgroup_accept = subparsers.add_parser('editgroup-accept', - help="accept an editgroup (by ident)") + sub_editgroup_accept = subparsers.add_parser( + "editgroup-accept", help="accept an editgroup (by ident)" + ) sub_editgroup_accept.set_defaults(func=run_editgroup_accept) - sub_editgroup_accept.add_argument('editgroup_id', - help="editgroup to accept") + sub_editgroup_accept.add_argument("editgroup_id", help="editgroup to accept") - sub_editgroup_submit = subparsers.add_parser('editgroup-submit', - help="submit an editgroup for review (by ident)") + sub_editgroup_submit = subparsers.add_parser( + "editgroup-submit", help="submit an editgroup for review (by ident)" + ) sub_editgroup_submit.set_defaults(func=run_editgroup_submit) - sub_editgroup_submit.add_argument('editgroup_id', - help="editgroup to submit") + sub_editgroup_submit.add_argument("editgroup_id", help="editgroup to submit") args = parser.parse_args() if not args.__dict__.get("func"): @@ -66,5 +69,6 @@ def main(): args.api = authenticated_api(args.fatcat_api_url) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_webface.py b/python/fatcat_webface.py index d12e8dad..acaa5936 100755 --- a/python/fatcat_webface.py +++ b/python/fatcat_webface.py @@ -6,21 +6,18 @@ from fatcat_web import app def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--debug', - action='store_true', - help="enable debugging interface (note: not for everything)") - parser.add_argument('--host', - default="127.0.0.1", - help="listen on this host/IP") - parser.add_argument('--port', - type=int, - default=9810, - help="listen on this port") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--debug", + action="store_true", + help="enable debugging interface (note: not for everything)", + ) + parser.add_argument("--host", default="127.0.0.1", help="listen on this host/IP") + parser.add_argument("--port", type=int, default=9810, help="listen on this port") args = parser.parse_args() app.run(debug=args.debug, host=args.host, port=args.port) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/fatcat_worker.py 
b/python/fatcat_worker.py index 397cf731..b776e0ce 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -20,10 +20,12 @@ sentry_client = raven.Client() def run_changelog(args): topic = "fatcat-{}.changelog".format(args.env) - worker = ChangelogWorker(args.api, args.kafka_hosts, topic, - poll_interval=args.poll_interval) + worker = ChangelogWorker( + args.api, args.kafka_hosts, topic, poll_interval=args.poll_interval + ) worker.run() + def run_entity_updates(args): changelog_topic = "fatcat-{}.changelog".format(args.env) release_topic = "fatcat-{}.release-updates-v03".format(args.env) @@ -31,7 +33,9 @@ def run_entity_updates(args): container_topic = "fatcat-{}.container-updates".format(args.env) work_ident_topic = "fatcat-{}.work-ident-updates".format(args.env) ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env) - worker = EntityUpdatesWorker(args.api, args.kafka_hosts, + worker = EntityUpdatesWorker( + args.api, + args.kafka_hosts, changelog_topic, release_topic=release_topic, file_topic=file_topic, @@ -41,86 +45,126 @@ def run_entity_updates(args): ) worker.run() + def run_elasticsearch_release(args): consume_topic = "fatcat-{}.release-updates-v03".format(args.env) - worker = ElasticsearchReleaseWorker(args.kafka_hosts, consume_topic, + worker = ElasticsearchReleaseWorker( + args.kafka_hosts, + consume_topic, elasticsearch_backend=args.elasticsearch_backend, - elasticsearch_index=args.elasticsearch_index) + elasticsearch_index=args.elasticsearch_index, + ) worker.run() + def run_elasticsearch_container(args): consume_topic = "fatcat-{}.container-updates".format(args.env) - worker = ElasticsearchContainerWorker(args.kafka_hosts, consume_topic, + worker = ElasticsearchContainerWorker( + args.kafka_hosts, + consume_topic, query_stats=args.query_stats, elasticsearch_release_index="fatcat_release", elasticsearch_backend=args.elasticsearch_backend, - elasticsearch_index=args.elasticsearch_index) + elasticsearch_index=args.elasticsearch_index, + ) worker.run() + def run_elasticsearch_changelog(args): consume_topic = "fatcat-{}.changelog".format(args.env) - worker = ElasticsearchChangelogWorker(args.kafka_hosts, consume_topic, + worker = ElasticsearchChangelogWorker( + args.kafka_hosts, + consume_topic, elasticsearch_backend=args.elasticsearch_backend, - elasticsearch_index=args.elasticsearch_index) + elasticsearch_index=args.elasticsearch_index, + ) worker.run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--api-host-url', - default="http://localhost:9411/v0", - help="fatcat API host/port to use") - parser.add_argument('--kafka-hosts', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--api-host-url", default="http://localhost:9411/v0", help="fatcat API host/port to use" + ) + parser.add_argument( + "--kafka-hosts", default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) subparsers = parser.add_subparsers() - sub_changelog = subparsers.add_parser('changelog', - help="poll fatcat API for changelog entries, push to kafka") + sub_changelog = subparsers.add_parser( + "changelog", help="poll fatcat API for 
changelog entries, push to kafka" + ) sub_changelog.set_defaults(func=run_changelog) - sub_changelog.add_argument('--poll-interval', + sub_changelog.add_argument( + "--poll-interval", help="how long to wait between polling (seconds)", - default=5.0, type=float) + default=5.0, + type=float, + ) - sub_entity_updates = subparsers.add_parser('entity-updates', - help="poll kafka for changelog entries; push entity changes to various kafka topics") + sub_entity_updates = subparsers.add_parser( + "entity-updates", + help="poll kafka for changelog entries; push entity changes to various kafka topics", + ) sub_entity_updates.set_defaults(func=run_entity_updates) - sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release', - help="consume kafka feed of new/updated releases, transform and push to search") + sub_elasticsearch_release = subparsers.add_parser( + "elasticsearch-release", + help="consume kafka feed of new/updated releases, transform and push to search", + ) sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release) - sub_elasticsearch_release.add_argument('--elasticsearch-backend', + sub_elasticsearch_release.add_argument( + "--elasticsearch-backend", help="elasticsearch backend to connect to", - default="http://localhost:9200") - sub_elasticsearch_release.add_argument('--elasticsearch-index', + default="http://localhost:9200", + ) + sub_elasticsearch_release.add_argument( + "--elasticsearch-index", help="elasticsearch index to push into", - default="fatcat_release_v03") + default="fatcat_release_v03", + ) - sub_elasticsearch_container = subparsers.add_parser('elasticsearch-container', - help="consume kafka feed of new/updated containers, transform and push to search") + sub_elasticsearch_container = subparsers.add_parser( + "elasticsearch-container", + help="consume kafka feed of new/updated containers, transform and push to search", + ) sub_elasticsearch_container.set_defaults(func=run_elasticsearch_container) - sub_elasticsearch_container.add_argument('--elasticsearch-backend', + sub_elasticsearch_container.add_argument( + "--elasticsearch-backend", help="elasticsearch backend to connect to", - default="http://localhost:9200") - sub_elasticsearch_container.add_argument('--elasticsearch-index', + default="http://localhost:9200", + ) + sub_elasticsearch_container.add_argument( + "--elasticsearch-index", help="elasticsearch index to push into", - default="fatcat_container") - sub_elasticsearch_container.add_argument('--query-stats', - action='store_true', - help="whether to query release search index for container stats") + default="fatcat_container", + ) + sub_elasticsearch_container.add_argument( + "--query-stats", + action="store_true", + help="whether to query release search index for container stats", + ) - sub_elasticsearch_changelog = subparsers.add_parser('elasticsearch-changelog', - help="consume changelog kafka feed, transform and push to search") + sub_elasticsearch_changelog = subparsers.add_parser( + "elasticsearch-changelog", + help="consume changelog kafka feed, transform and push to search", + ) sub_elasticsearch_changelog.set_defaults(func=run_elasticsearch_changelog) - sub_elasticsearch_changelog.add_argument('--elasticsearch-backend', + sub_elasticsearch_changelog.add_argument( + "--elasticsearch-backend", help="elasticsearch backend to connect to", - default="http://localhost:9200") - sub_elasticsearch_changelog.add_argument('--elasticsearch-index', + default="http://localhost:9200", + ) + sub_elasticsearch_changelog.add_argument( + 
"--elasticsearch-index", help="elasticsearch index to push into", - default="fatcat_changelog") + default="fatcat_changelog", + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -130,5 +174,6 @@ def main(): args.api = public_api(args.api_host_url) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/shell.py b/python/shell.py index 9b561f5f..e8879a70 100644 --- a/python/shell.py +++ b/python/shell.py @@ -12,9 +12,9 @@ from fatcat_openapi_client.rest import ApiException from fatcat_tools import * -if __name__ == '__main__': +if __name__ == "__main__": - #api = + # api = print(" __ _ _ _ ") print(" / _| __ _| |_ ___ __ _| |_| |") print("| |_ / _` | __/ __/ _` | __| |") @@ -24,23 +24,27 @@ if __name__ == '__main__': admin_id = "aaaaaaaaaaaabkvkaaaaaaaaae" - #fatcat_openapi_client.configuration.api_key['Authorization'] = 'YOUR_API_KEY' - #fatcat_openapi_client.configuration.api_key_prefix['Authorization'] = 'Bearer' + # fatcat_openapi_client.configuration.api_key['Authorization'] = 'YOUR_API_KEY' + # fatcat_openapi_client.configuration.api_key_prefix['Authorization'] = 'Bearer' local_conf = fatcat_openapi_client.Configuration() - local_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug=" + local_conf.api_key[ + "Authorization" + ] = "AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug=" local_conf.api_key_prefix["Authorization"] = "Bearer" - local_conf.host = 'http://localhost:9411/v0' + local_conf.host = "http://localhost:9411/v0" local_conf.debug = True local_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(local_conf)) - #prod_conf = fatcat_openapi_client.Configuration() - #prod_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw" - #prod_conf.api_key_prefix["Authorization"] = "Bearer" - #prod_conf.host = 'https://api.fatcat.wiki/v0' - #prod_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(prod_conf)) + # prod_conf = fatcat_openapi_client.Configuration() + # prod_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw" + # prod_conf.api_key_prefix["Authorization"] = "Bearer" + # prod_conf.host = 'https://api.fatcat.wiki/v0' + # prod_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(prod_conf)) qa_conf = fatcat_openapi_client.Configuration() - qa_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw" + qa_conf.api_key[ + "Authorization" + ] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw" qa_conf.api_key_prefix["Authorization"] = "Bearer" - qa_conf.host = 'https://api.qa.fatcat.wiki/v0' + 
qa_conf.host = "https://api.qa.fatcat.wiki/v0" qa_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(qa_conf))