Diffstat (limited to 'python')
-rwxr-xr-x  python/fatcat_cleanup.py     58
-rwxr-xr-x  python/fatcat_export.py      56
-rwxr-xr-x  python/fatcat_harvest.py     94
-rwxr-xr-x  python/fatcat_import.py     950
-rwxr-xr-x  python/fatcat_ingest.py     210
-rwxr-xr-x  python/fatcat_review.py      45
-rwxr-xr-x  python/fatcat_transform.py  172
-rwxr-xr-x  python/fatcat_util.py        48
-rwxr-xr-x  python/fatcat_webface.py     23
-rwxr-xr-x  python/fatcat_worker.py     139
-rw-r--r--  python/shell.py              30
11 files changed, 1110 insertions, 715 deletions
diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py
index 04ee2bd9..f8030b16 100755
--- a/python/fatcat_cleanup.py
+++ b/python/fatcat_cleanup.py
@@ -15,38 +15,45 @@ sentry_client = raven.Client()
def run_files(args):
- fmi = FileCleaner(args.api,
+ fmi = FileCleaner(
+ args.api,
dry_run_mode=args.dry_run,
edit_batch_size=args.batch_size,
- editgroup_description=args.editgroup_description_override)
+ editgroup_description=args.editgroup_description_override,
+ )
JsonLinePusher(fmi, args.json_file).run()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--fatcat-api-url',
- default="http://localhost:9411/v0",
- help="connect to this host/port")
- parser.add_argument('--batch-size',
- help="size of batch to send",
- default=50, type=int)
- parser.add_argument('--editgroup-description-override',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.add_argument(
+ "--editgroup-description-override",
help="editgroup description override",
- default=None, type=str)
- parser.add_argument('--dry-run',
- help="dry-run mode (don't actually update)",
- default=False, type=bool)
+ default=None,
+ type=str,
+ )
+ parser.add_argument(
+ "--dry-run", help="dry-run mode (don't actually update)", default=False, type=bool
+ )
subparsers = parser.add_subparsers()
- sub_files = subparsers.add_parser('files',
- help="attempt metadata cleanups over a list of file entities")
+ sub_files = subparsers.add_parser(
+ "files", help="attempt metadata cleanups over a list of file entities"
+ )
sub_files.set_defaults(
func=run_files,
auth_var="FATCAT_AUTH_WORKER_CLEANUP",
)
- sub_files.add_argument('json_file',
+ sub_files.add_argument(
+ "json_file",
help="files JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -55,15 +62,18 @@ def main():
# allow editgroup description override via env variable (but CLI arg takes
# precedence)
- if not args.editgroup_description_override \
- and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
- args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')
+ if not args.editgroup_description_override and os.environ.get(
+ "FATCAT_EDITGROUP_DESCRIPTION"
+ ):
+ args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")
args.api = authenticated_api(
args.fatcat_api_url,
# token is an optional kwarg (can be empty string, None, etc)
- token=os.environ.get(args.auth_var))
+ token=os.environ.get(args.auth_var),
+ )
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index 9a7cfd01..ebdc5af8 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -19,7 +19,9 @@ def run_export_releases(args):
ident = uuid2fcid(line.split()[0])
release = args.api.get_release(ident=ident, expand="all")
args.json_output.write(
- json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n")
+ json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n"
+ )
+
def run_export_changelog(args):
end = args.end
@@ -30,36 +32,47 @@ def run_export_changelog(args):
for i in range(args.start, end):
entry = args.api.get_changelog_entry(index=i)
args.json_output.write(
- json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n")
+ json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n"
+ )
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--fatcat-api-url',
- default="http://localhost:9411/v0",
- help="connect to this host/port")
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
subparsers = parser.add_subparsers()
- sub_releases = subparsers.add_parser('releases')
+ sub_releases = subparsers.add_parser("releases")
sub_releases.set_defaults(func=run_export_releases)
- sub_releases.add_argument('ident_file',
+ sub_releases.add_argument(
+ "ident_file",
help="TSV list of fatcat release idents to dump",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_releases.add_argument('json_output',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_releases.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
- sub_changelog = subparsers.add_parser('changelog')
+ sub_changelog = subparsers.add_parser("changelog")
sub_changelog.set_defaults(func=run_export_changelog)
- sub_changelog.add_argument('--start',
- help="index to start dumping at",
- default=1, type=int)
- sub_changelog.add_argument('--end',
+ sub_changelog.add_argument("--start", help="index to start dumping at", default=1, type=int)
+ sub_changelog.add_argument(
+ "--end",
help="index to stop dumping at (else detect most recent)",
- default=None, type=int)
- sub_changelog.add_argument('json_output',
+ default=None,
+ type=int,
+ )
+ sub_changelog.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -69,5 +82,6 @@ def main():
args.api = public_api(args.fatcat_api_url)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 0324aa52..91356aad 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -26,9 +26,11 @@ def run_crossref(args):
state_topic=f"fatcat-{args.env}.api-crossref-state",
contact_email=args.contact_email,
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_datacite(args):
worker = HarvestDataciteWorker(
kafka_hosts=args.kafka_hosts,
@@ -36,93 +38,108 @@ def run_datacite(args):
state_topic=f"fatcat-{args.env}.api-datacite-state",
contact_email=args.contact_email,
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_arxiv(args):
worker = HarvestArxivWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.oaipmh-arxiv",
state_topic=f"fatcat-{args.env}.oaipmh-arxiv-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_pubmed(args):
worker = PubmedFTPWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.ftp-pubmed",
state_topic=f"fatcat-{args.env}.ftp-pubmed-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_doaj_article(args):
worker = HarvestDoajArticleWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.oaipmh-doaj-article",
state_topic="fatcat-{args.env}.oaipmh-doaj-article-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
+
def run_doaj_journal(args):
worker = HarvestDoajJournalWorker(
kafka_hosts=args.kafka_hosts,
produce_topic=f"fatcat-{args.env}.oaipmh-doaj-journal",
state_topic=f"fatcat-{args.env}.oaipmh-doaj-journal-state",
start_date=args.start_date,
- end_date=args.end_date)
+ end_date=args.end_date,
+ )
worker.run(continuous=args.continuous)
def mkdate(raw):
return datetime.datetime.strptime(raw, "%Y-%m-%d").date()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('--start-date',
- default=None, type=mkdate,
- help="beginning of harvest period")
- parser.add_argument('--end-date',
- default=None, type=mkdate,
- help="end of harvest period")
- parser.add_argument('--contact-email',
- default="undefined", # better?
- help="contact email to use in API header")
- parser.add_argument('--continuous',
- action='store_true',
- help="continue harvesting indefinitely in a loop?")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--start-date", default=None, type=mkdate, help="beginning of harvest period"
+ )
+ parser.add_argument("--end-date", default=None, type=mkdate, help="end of harvest period")
+ parser.add_argument(
+ "--contact-email",
+ default="undefined", # better?
+ help="contact email to use in API header",
+ )
+ parser.add_argument(
+ "--continuous", action="store_true", help="continue harvesting indefinitely in a loop?"
+ )
subparsers = parser.add_subparsers()
- sub_crossref = subparsers.add_parser('crossref',
- help="harvest DOI metadata from Crossref API (JSON)")
+ sub_crossref = subparsers.add_parser(
+ "crossref", help="harvest DOI metadata from Crossref API (JSON)"
+ )
sub_crossref.set_defaults(func=run_crossref)
- sub_datacite = subparsers.add_parser('datacite',
- help="harvest DOI metadata from Datacite API (JSON)")
+ sub_datacite = subparsers.add_parser(
+ "datacite", help="harvest DOI metadata from Datacite API (JSON)"
+ )
sub_datacite.set_defaults(func=run_datacite)
- sub_arxiv = subparsers.add_parser('arxiv',
- help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)")
+ sub_arxiv = subparsers.add_parser(
+ "arxiv", help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)"
+ )
sub_arxiv.set_defaults(func=run_arxiv)
- sub_pubmed = subparsers.add_parser('pubmed',
- help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)")
+ sub_pubmed = subparsers.add_parser(
+ "pubmed", help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)"
+ )
sub_pubmed.set_defaults(func=run_pubmed)
# DOAJ stuff disabled because API range-requests are broken
- #sub_doaj_article = subparsers.add_parser('doaj-article')
- #sub_doaj_article.set_defaults(func=run_doaj_article)
- #sub_doaj_journal = subparsers.add_parser('doaj-journal')
- #sub_doaj_journal.set_defaults(func=run_doaj_journal)
+ # sub_doaj_article = subparsers.add_parser('doaj-article')
+ # sub_doaj_article.set_defaults(func=run_doaj_article)
+ # sub_doaj_journal = subparsers.add_parser('doaj-journal')
+ # sub_doaj_journal.set_defaults(func=run_doaj_journal)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -130,5 +147,6 @@ def main():
sys.exit(-1)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 3225688c..42001974 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -51,11 +51,13 @@ sentry_client = raven.Client()
def run_crossref(args):
- fci = CrossrefImporter(args.api,
+ fci = CrossrefImporter(
+ args.api,
args.issn_map_file,
extid_map_file=args.extid_map_file,
edit_batch_size=args.batch_size,
- bezerk_mode=args.bezerk_mode)
+ bezerk_mode=args.bezerk_mode,
+ )
if args.kafka_mode:
KafkaJsonPusher(
fci,
@@ -68,15 +70,14 @@ def run_crossref(args):
else:
JsonLinePusher(fci, args.json_file).run()
+
def run_jalc(args):
- ji = JalcImporter(args.api,
- args.issn_map_file,
- extid_map_file=args.extid_map_file)
+ ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file)
Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run()
+
def run_arxiv(args):
- ari = ArxivRawImporter(args.api,
- edit_batch_size=args.batch_size)
+ ari = ArxivRawImporter(args.api, edit_batch_size=args.batch_size)
if args.kafka_mode:
KafkaBs4XmlPusher(
ari,
@@ -87,15 +88,18 @@ def run_arxiv(args):
).run()
else:
if args.xml_file == sys.stdin:
- print('note: reading from stdin', file=sys.stderr)
+ print("note: reading from stdin", file=sys.stderr)
Bs4XmlFilePusher(ari, args.xml_file, "record").run()
+
def run_pubmed(args):
- pi = PubmedImporter(args.api,
+ pi = PubmedImporter(
+ args.api,
args.issn_map_file,
edit_batch_size=args.batch_size,
do_updates=args.do_updates,
- lookup_refs=(not args.no_lookup_refs))
+ lookup_refs=(not args.no_lookup_refs),
+ )
if args.kafka_mode:
KafkaBs4XmlPusher(
pi,
@@ -111,62 +115,67 @@ def run_pubmed(args):
["PubmedArticle"],
).run()
+
def run_jstor(args):
- ji = JstorImporter(args.api,
- args.issn_map_file,
- edit_batch_size=args.batch_size)
+ ji = JstorImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size)
Bs4XmlFileListPusher(ji, args.list_file, "article").run()
+
def run_orcid(args):
- foi = OrcidImporter(args.api,
- edit_batch_size=args.batch_size)
+ foi = OrcidImporter(args.api, edit_batch_size=args.batch_size)
JsonLinePusher(foi, args.json_file).run()
+
def run_journal_metadata(args):
- fii = JournalMetadataImporter(args.api,
- edit_batch_size=args.batch_size)
+ fii = JournalMetadataImporter(args.api, edit_batch_size=args.batch_size)
JsonLinePusher(fii, args.json_file).run()
+
def run_chocula(args):
- fii = ChoculaImporter(args.api,
- edit_batch_size=args.batch_size,
- do_updates=args.do_updates)
+ fii = ChoculaImporter(args.api, edit_batch_size=args.batch_size, do_updates=args.do_updates)
JsonLinePusher(fii, args.json_file).run()
+
def run_matched(args):
- fmi = MatchedImporter(args.api,
+ fmi = MatchedImporter(
+ args.api,
edit_batch_size=args.batch_size,
editgroup_description=args.editgroup_description_override,
default_link_rel=args.default_link_rel,
- default_mimetype=args.default_mimetype)
+ default_mimetype=args.default_mimetype,
+ )
JsonLinePusher(fmi, args.json_file).run()
+
def run_arabesque_match(args):
- if (args.sqlite_file and args.json_file) or not (args.sqlite_file or
- args.json_file):
+ if (args.sqlite_file and args.json_file) or not (args.sqlite_file or args.json_file):
print("Supply one of --sqlite-file or --json-file")
- ami = ArabesqueMatchImporter(args.api,
+ ami = ArabesqueMatchImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
do_updates=args.do_updates,
require_grobid=(not args.no_require_grobid),
extid_type=args.extid_type,
crawl_id=args.crawl_id,
default_link_rel=args.default_link_rel,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.sqlite_file:
- SqlitePusher(ami, args.sqlite_file, "crawl_result",
- ARABESQUE_MATCH_WHERE_CLAUSE).run()
+ SqlitePusher(ami, args.sqlite_file, "crawl_result", ARABESQUE_MATCH_WHERE_CLAUSE).run()
elif args.json_file:
JsonLinePusher(ami, args.json_file).run()
+
def run_ingest_file(args):
- ifri = IngestFileResultImporter(args.api,
+ ifri = IngestFileResultImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
skip_source_allowlist=args.skip_source_allowlist,
do_updates=args.do_updates,
default_link_rel=args.default_link_rel,
require_grobid=(not args.no_require_grobid),
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.kafka_mode:
KafkaJsonPusher(
ifri,
@@ -180,13 +189,16 @@ def run_ingest_file(args):
else:
JsonLinePusher(ifri, args.json_file).run()
+
def run_ingest_web(args):
- iwri = IngestWebResultImporter(args.api,
+ iwri = IngestWebResultImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
skip_source_allowlist=args.skip_source_allowlist,
do_updates=args.do_updates,
default_link_rel=args.default_link_rel,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.kafka_mode:
KafkaJsonPusher(
iwri,
@@ -201,13 +213,16 @@ def run_ingest_web(args):
else:
JsonLinePusher(iwri, args.json_file).run()
+
def run_ingest_fileset(args):
- ifri = IngestFilesetResultImporter(args.api,
+ ifri = IngestFilesetResultImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
skip_source_allowlist=args.skip_source_allowlist,
do_updates=args.do_updates,
default_link_rel=args.default_link_rel,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.kafka_mode:
KafkaJsonPusher(
ifri,
@@ -222,10 +237,13 @@ def run_ingest_fileset(args):
else:
JsonLinePusher(ifri, args.json_file).run()
+
def run_savepapernow_file(args):
- ifri = SavePaperNowFileImporter(args.api,
+ ifri = SavePaperNowFileImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.kafka_mode:
KafkaJsonPusher(
ifri,
@@ -240,10 +258,13 @@ def run_savepapernow_file(args):
else:
JsonLinePusher(ifri, args.json_file).run()
+
def run_savepapernow_web(args):
- ifri = SavePaperNowWebImporter(args.api,
+ ifri = SavePaperNowWebImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.kafka_mode:
KafkaJsonPusher(
ifri,
@@ -258,10 +279,13 @@ def run_savepapernow_web(args):
else:
JsonLinePusher(ifri, args.json_file).run()
+
def run_savepapernow_fileset(args):
- ifri = SavePaperNowFilesetImporter(args.api,
+ ifri = SavePaperNowFilesetImporter(
+ args.api,
editgroup_description=args.editgroup_description_override,
- edit_batch_size=args.batch_size)
+ edit_batch_size=args.batch_size,
+ )
if args.kafka_mode:
KafkaJsonPusher(
ifri,
@@ -276,18 +300,22 @@ def run_savepapernow_fileset(args):
else:
JsonLinePusher(ifri, args.json_file).run()
+
def run_grobid_metadata(args):
- fmi = GrobidMetadataImporter(args.api,
+ fmi = GrobidMetadataImporter(
+ args.api,
edit_batch_size=args.batch_size,
longtail_oa=args.longtail_oa,
- bezerk_mode=args.bezerk_mode)
+ bezerk_mode=args.bezerk_mode,
+ )
LinePusher(fmi, args.tsv_file).run()
+
def run_shadow_lib(args):
- fmi = ShadowLibraryImporter(args.api,
- edit_batch_size=100)
+ fmi = ShadowLibraryImporter(args.api, edit_batch_size=100)
JsonLinePusher(fmi, args.json_file).run()
+
def run_wayback_static(args):
api = args.api
@@ -295,8 +323,8 @@ def run_wayback_static(args):
if args.release_id:
release_id = args.release_id
elif args.extid:
- idtype = args.extid.split(':')[0]
- extid = ':'.join(args.extid.split(':')[1:])
+ idtype = args.extid.split(":")[0]
+ extid = ":".join(args.extid.split(":")[1:])
if idtype == "doi":
release_id = api.lookup_release(doi=extid).ident
elif idtype == "pmid":
@@ -309,8 +337,9 @@ def run_wayback_static(args):
raise Exception("need either release_id or extid argument")
# create it
- (editgroup_id, wc) = auto_wayback_static(api, release_id, args.wayback_url,
- editgroup_id=args.editgroup_id)
+ (editgroup_id, wc) = auto_wayback_static(
+ api, release_id, args.wayback_url, editgroup_id=args.editgroup_id
+ )
if not wc:
return
print("release_id: {}".format(release_id))
@@ -318,12 +347,14 @@ def run_wayback_static(args):
print("webcapture id: {}".format(wc.ident))
print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident))
+
def run_cdl_dash_dat(args):
api = args.api
# create it
- (editgroup_id, release, fs) = auto_cdl_dash_dat(api, args.dat_path,
- release_id=args.release_id, editgroup_id=args.editgroup_id)
+ (editgroup_id, release, fs) = auto_cdl_dash_dat(
+ api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id
+ )
if not fs:
return
print("release_id: {}".format(release.ident))
@@ -331,14 +362,17 @@ def run_cdl_dash_dat(args):
print("fileset id: {}".format(fs.ident))
print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
+
def run_datacite(args):
- dci = DataciteImporter(args.api,
+ dci = DataciteImporter(
+ args.api,
args.issn_map_file,
edit_batch_size=args.batch_size,
bezerk_mode=args.bezerk_mode,
debug=args.debug,
extid_map_file=args.extid_map_file,
- insert_log_file=args.insert_log_file)
+ insert_log_file=args.insert_log_file,
+ )
if args.kafka_mode:
KafkaJsonPusher(
dci,
@@ -351,8 +385,10 @@ def run_datacite(args):
else:
JsonLinePusher(dci, args.json_file).run()
+
def run_doaj_article(args):
- dai = DoajArticleImporter(args.api,
+ dai = DoajArticleImporter(
+ args.api,
args.issn_map_file,
edit_batch_size=args.batch_size,
do_updates=args.do_updates,
@@ -369,8 +405,10 @@ def run_doaj_article(args):
else:
JsonLinePusher(dai, args.json_file).run()
+
def run_dblp_release(args):
- dri = DblpReleaseImporter(args.api,
+ dri = DblpReleaseImporter(
+ args.api,
dblp_container_map_file=args.dblp_container_map_file,
edit_batch_size=args.batch_size,
do_updates=args.do_updates,
@@ -383,8 +421,10 @@ def run_dblp_release(args):
use_lxml=True,
).run()
+
def run_dblp_container(args):
- dci = DblpContainerImporter(args.api,
+ dci = DblpContainerImporter(
+ args.api,
args.issn_map_file,
dblp_container_map_file=args.dblp_container_map_file,
dblp_container_map_output=args.dblp_container_map_output,
@@ -393,14 +433,17 @@ def run_dblp_container(args):
)
JsonLinePusher(dci, args.json_file).run()
+
def run_file_meta(args):
# do_updates defaults to true for this importer
- fmi = FileMetaImporter(args.api,
+ fmi = FileMetaImporter(
+ args.api,
edit_batch_size=100,
editgroup_description=args.editgroup_description_override,
)
JsonLinePusher(fmi, args.json_file).run()
+
def run_fileset(args):
fmi = FilesetImporter(
args.api,
@@ -409,478 +452,664 @@ def run_fileset(args):
)
JsonLinePusher(fmi, args.json_file).run()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--host-url',
- default="http://localhost:9411/v0",
- help="connect to this host/port")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa)")
- parser.add_argument('--batch-size',
- help="size of batch to send",
- default=50, type=int)
- parser.add_argument('--editgroup-description-override',
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa)"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.add_argument(
+ "--editgroup-description-override",
help="editgroup description override",
- default=None, type=str)
+ default=None,
+ type=str,
+ )
subparsers = parser.add_subparsers()
- sub_crossref = subparsers.add_parser('crossref',
- help="import Crossref API metadata format (JSON)")
+ sub_crossref = subparsers.add_parser(
+ "crossref", help="import Crossref API metadata format (JSON)"
+ )
sub_crossref.set_defaults(
func=run_crossref,
auth_var="FATCAT_AUTH_WORKER_CROSSREF",
)
- sub_crossref.add_argument('json_file',
+ sub_crossref.add_argument(
+ "json_file",
help="crossref JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_crossref.add_argument('issn_map_file',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "issn_map_file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
- sub_crossref.add_argument('--extid-map-file',
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_crossref.add_argument(
+ "--extid-map-file",
help="DOI-to-other-identifiers sqlite3 database",
- default=None, type=str)
- sub_crossref.add_argument('--no-lookup-refs',
- action='store_true',
- help="skip lookup of references (PMID or DOI)")
- sub_crossref.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
- sub_crossref.add_argument('--bezerk-mode',
- action='store_true',
- help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
-
- sub_jalc = subparsers.add_parser('jalc',
- help="import JALC DOI metadata from XML dump")
+ default=None,
+ type=str,
+ )
+ sub_crossref.add_argument(
+ "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
+ )
+ sub_crossref.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+ sub_crossref.add_argument(
+ "--bezerk-mode",
+ action="store_true",
+ help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)",
+ )
+
+ sub_jalc = subparsers.add_parser("jalc", help="import JALC DOI metadata from XML dump")
sub_jalc.set_defaults(
func=run_jalc,
auth_var="FATCAT_AUTH_WORKER_JALC",
)
- sub_jalc.add_argument('xml_file',
+ sub_jalc.add_argument(
+ "xml_file",
help="Jalc RDF XML file (record-per-line) to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_jalc.add_argument('issn_map_file',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_jalc.add_argument(
+ "issn_map_file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
- sub_jalc.add_argument('--extid-map-file',
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_jalc.add_argument(
+ "--extid-map-file",
help="DOI-to-other-identifiers sqlite3 database",
- default=None, type=str)
+ default=None,
+ type=str,
+ )
- sub_arxiv = subparsers.add_parser('arxiv',
- help="import arxiv.org metadata from XML files")
+ sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
sub_arxiv.set_defaults(
func=run_arxiv,
auth_var="FATCAT_AUTH_WORKER_ARXIV",
)
- sub_arxiv.add_argument('xml_file',
- nargs='?',
+ sub_arxiv.add_argument(
+ "xml_file",
+ nargs="?",
help="arXivRaw XML file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_arxiv.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_arxiv.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
- sub_pubmed = subparsers.add_parser('pubmed',
- help="import MEDLINE/PubMed work-level metadata (XML)")
+ sub_pubmed = subparsers.add_parser(
+ "pubmed", help="import MEDLINE/PubMed work-level metadata (XML)"
+ )
sub_pubmed.set_defaults(
func=run_pubmed,
auth_var="FATCAT_AUTH_WORKER_PUBMED",
)
- sub_pubmed.add_argument('xml_file',
- nargs='?',
+ sub_pubmed.add_argument(
+ "xml_file",
+ nargs="?",
help="Pubmed XML file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_pubmed.add_argument('issn_map_file',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_pubmed.add_argument(
+ "issn_map_file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
- sub_pubmed.add_argument('--no-lookup-refs',
- action='store_true',
- help="skip lookup of references (PMID or DOI)")
- sub_pubmed.add_argument('--do-updates',
- action='store_true',
- help="update pre-existing release entities")
- sub_pubmed.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
-
- sub_jstor = subparsers.add_parser('jstor',
- help="import JSTOR work-level metadata from XML dump")
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_pubmed.add_argument(
+ "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)"
+ )
+ sub_pubmed.add_argument(
+ "--do-updates", action="store_true", help="update pre-existing release entities"
+ )
+ sub_pubmed.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+
+ sub_jstor = subparsers.add_parser(
+ "jstor", help="import JSTOR work-level metadata from XML dump"
+ )
sub_jstor.set_defaults(
func=run_jstor,
auth_var="FATCAT_AUTH_WORKER_JSTOR",
)
- sub_jstor.add_argument('list_file',
+ sub_jstor.add_argument(
+ "list_file",
help="List of JSTOR XML file paths to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_jstor.add_argument('issn_map_file',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_jstor.add_argument(
+ "issn_map_file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
+ default=None,
+ type=argparse.FileType("r"),
+ )
- sub_orcid = subparsers.add_parser('orcid',
- help="import creator entities from ORCID XML dump")
- sub_orcid.set_defaults(
- func=run_orcid,
- auth_var="FATCAT_AUTH_WORKER_ORCID"
+ sub_orcid = subparsers.add_parser(
+ "orcid", help="import creator entities from ORCID XML dump"
)
- sub_orcid.add_argument('json_file',
+ sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
+ sub_orcid.add_argument(
+ "json_file",
help="orcid JSON file to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
- sub_journal_metadata = subparsers.add_parser('journal-metadata',
- help="import/update container metadata from old manual munging format")
+ sub_journal_metadata = subparsers.add_parser(
+ "journal-metadata",
+ help="import/update container metadata from old manual munging format",
+ )
sub_journal_metadata.set_defaults(
func=run_journal_metadata,
auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
)
- sub_journal_metadata.add_argument('json_file',
+ sub_journal_metadata.add_argument(
+ "json_file",
help="Journal JSON metadata file to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
- sub_chocula = subparsers.add_parser('chocula',
- help="import/update container metadata from chocula JSON export")
+ sub_chocula = subparsers.add_parser(
+ "chocula", help="import/update container metadata from chocula JSON export"
+ )
sub_chocula.set_defaults(
func=run_chocula,
auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
)
- sub_chocula.add_argument('json_file',
+ sub_chocula.add_argument(
+ "json_file",
help="chocula JSON entities file (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_chocula.add_argument('--do-updates',
- action='store_true',
- help="update pre-existing container entities")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_chocula.add_argument(
+ "--do-updates", action="store_true", help="update pre-existing container entities"
+ )
- sub_matched = subparsers.add_parser('matched',
- help="add file entities matched against existing releases; custom JSON format")
+ sub_matched = subparsers.add_parser(
+ "matched",
+ help="add file entities matched against existing releases; custom JSON format",
+ )
sub_matched.set_defaults(
func=run_matched,
auth_var="FATCAT_API_AUTH_TOKEN",
)
- sub_matched.add_argument('json_file',
+ sub_matched.add_argument(
+ "json_file",
help="JSON file to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_matched.add_argument('--default-mimetype',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_matched.add_argument(
+ "--default-mimetype",
default=None,
- help="default mimetype for imported files (if not specified per-file)")
- sub_matched.add_argument('--bezerk-mode',
- action='store_true',
- help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
- sub_matched.add_argument('--default-link-rel',
+ help="default mimetype for imported files (if not specified per-file)",
+ )
+ sub_matched.add_argument(
+ "--bezerk-mode",
+ action="store_true",
+ help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)",
+ )
+ sub_matched.add_argument(
+ "--default-link-rel",
default="web",
- help="default URL rel for matches (eg, 'publisher', 'web')")
+ help="default URL rel for matches (eg, 'publisher', 'web')",
+ )
- sub_arabesque_match = subparsers.add_parser('arabesque',
- help="add file entities matched to releases from crawl log analysis")
+ sub_arabesque_match = subparsers.add_parser(
+ "arabesque", help="add file entities matched to releases from crawl log analysis"
+ )
sub_arabesque_match.set_defaults(
func=run_arabesque_match,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
)
- sub_arabesque_match.add_argument('--sqlite-file',
- help="sqlite database file to import from")
- sub_arabesque_match.add_argument('--json-file',
- help="JSON file to import from (or stdin)",
- type=argparse.FileType('r'))
- sub_arabesque_match.add_argument('--do-updates',
- action='store_true',
- help="update pre-existing file entities if new match (instead of skipping)")
- sub_arabesque_match.add_argument('--no-require-grobid',
- action='store_true',
- help="whether postproc_status column must be '200'")
- sub_arabesque_match.add_argument('--extid-type',
+ sub_arabesque_match.add_argument(
+ "--sqlite-file", help="sqlite database file to import from"
+ )
+ sub_arabesque_match.add_argument(
+ "--json-file", help="JSON file to import from (or stdin)", type=argparse.FileType("r")
+ )
+ sub_arabesque_match.add_argument(
+ "--do-updates",
+ action="store_true",
+ help="update pre-existing file entities if new match (instead of skipping)",
+ )
+ sub_arabesque_match.add_argument(
+ "--no-require-grobid",
+ action="store_true",
+ help="whether postproc_status column must be '200'",
+ )
+ sub_arabesque_match.add_argument(
+ "--extid-type",
default="doi",
- help="identifier type in the database (eg, 'doi', 'pmcid'")
- sub_arabesque_match.add_argument('--crawl-id',
- help="crawl ID (optionally included in editgroup metadata)")
- sub_arabesque_match.add_argument('--default-link-rel',
+ help="identifier type in the database (eg, 'doi', 'pmcid'",
+ )
+ sub_arabesque_match.add_argument(
+ "--crawl-id", help="crawl ID (optionally included in editgroup metadata)"
+ )
+ sub_arabesque_match.add_argument(
+ "--default-link-rel",
default="web",
- help="default URL rel for matches (eg, 'publisher', 'web')")
+ help="default URL rel for matches (eg, 'publisher', 'web')",
+ )
- sub_ingest_file = subparsers.add_parser('ingest-file-results',
- help="add/update file entities linked to releases based on sandcrawler ingest results")
+ sub_ingest_file = subparsers.add_parser(
+ "ingest-file-results",
+ help="add/update file entities linked to releases based on sandcrawler ingest results",
+ )
sub_ingest_file.set_defaults(
func=run_ingest_file,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
)
- sub_ingest_file.add_argument('json_file',
+ sub_ingest_file.add_argument(
+ "json_file",
help="ingest_file JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_ingest_file.add_argument('--skip-source-allowlist',
- action='store_true',
- help="don't filter import based on request source allowlist")
- sub_ingest_file.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
- sub_ingest_file.add_argument('--do-updates',
- action='store_true',
- help="update pre-existing file entities if new match (instead of skipping)")
- sub_ingest_file.add_argument('--no-require-grobid',
- action='store_true',
- help="whether postproc_status column must be '200'")
- sub_ingest_file.add_argument('--default-link-rel',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_ingest_file.add_argument(
+ "--skip-source-allowlist",
+ action="store_true",
+ help="don't filter import based on request source allowlist",
+ )
+ sub_ingest_file.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+ sub_ingest_file.add_argument(
+ "--do-updates",
+ action="store_true",
+ help="update pre-existing file entities if new match (instead of skipping)",
+ )
+ sub_ingest_file.add_argument(
+ "--no-require-grobid",
+ action="store_true",
+ help="whether postproc_status column must be '200'",
+ )
+ sub_ingest_file.add_argument(
+ "--default-link-rel",
default="web",
- help="default URL rel for matches (eg, 'publisher', 'web')")
+ help="default URL rel for matches (eg, 'publisher', 'web')",
+ )
- sub_ingest_web = subparsers.add_parser('ingest-web-results',
- help="add/update web entities linked to releases based on sandcrawler ingest results")
+ sub_ingest_web = subparsers.add_parser(
+ "ingest-web-results",
+ help="add/update web entities linked to releases based on sandcrawler ingest results",
+ )
sub_ingest_web.set_defaults(
func=run_ingest_web,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
)
- sub_ingest_web.add_argument('json_file',
+ sub_ingest_web.add_argument(
+ "json_file",
help="ingest_web JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_ingest_web.add_argument('--skip-source-allowlist',
- action='store_true',
- help="don't filter import based on request source allowlist")
- sub_ingest_web.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
- sub_ingest_web.add_argument('--do-updates',
- action='store_true',
- help="update pre-existing web entities if new match (instead of skipping)")
- sub_ingest_web.add_argument('--default-link-rel',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_ingest_web.add_argument(
+ "--skip-source-allowlist",
+ action="store_true",
+ help="don't filter import based on request source allowlist",
+ )
+ sub_ingest_web.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+ sub_ingest_web.add_argument(
+ "--do-updates",
+ action="store_true",
+ help="update pre-existing web entities if new match (instead of skipping)",
+ )
+ sub_ingest_web.add_argument(
+ "--default-link-rel",
default="web",
- help="default URL rel for matches (eg, 'publisher', 'web')")
+ help="default URL rel for matches (eg, 'publisher', 'web')",
+ )
- sub_ingest_fileset = subparsers.add_parser('ingest-fileset-results',
- help="add/update fileset entities linked to releases based on sandcrawler ingest results")
+ sub_ingest_fileset = subparsers.add_parser(
+ "ingest-fileset-results",
+ help="add/update fileset entities linked to releases based on sandcrawler ingest results",
+ )
sub_ingest_fileset.set_defaults(
func=run_ingest_fileset,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
)
- sub_ingest_fileset.add_argument('json_file',
+ sub_ingest_fileset.add_argument(
+ "json_file",
help="ingest_fileset JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_ingest_fileset.add_argument('--skip-source-allowlist',
- action='store_true',
- help="don't filter import based on request source allowlist")
- sub_ingest_fileset.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
- sub_ingest_fileset.add_argument('--do-updates',
- action='store_true',
- help="update pre-existing fileset entities if new match (instead of skipping)")
- sub_ingest_fileset.add_argument('--default-link-rel',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_ingest_fileset.add_argument(
+ "--skip-source-allowlist",
+ action="store_true",
+ help="don't filter import based on request source allowlist",
+ )
+ sub_ingest_fileset.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+ sub_ingest_fileset.add_argument(
+ "--do-updates",
+ action="store_true",
+ help="update pre-existing fileset entities if new match (instead of skipping)",
+ )
+ sub_ingest_fileset.add_argument(
+ "--default-link-rel",
default="fileset",
- help="default URL rel for matches (eg, 'publisher', 'web')")
+ help="default URL rel for matches (eg, 'publisher', 'web')",
+ )
- sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results',
- help="add file entities crawled due to async Save Paper Now request")
+ sub_savepapernow_file = subparsers.add_parser(
+ "savepapernow-file-results",
+ help="add file entities crawled due to async Save Paper Now request",
+ )
sub_savepapernow_file.set_defaults(
func=run_savepapernow_file,
auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
)
- sub_savepapernow_file.add_argument('json_file',
+ sub_savepapernow_file.add_argument(
+ "json_file",
help="ingest-file JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_savepapernow_file.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_savepapernow_file.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
- sub_savepapernow_web = subparsers.add_parser('savepapernow-web-results',
- help="add webcapture entities crawled due to async Save Paper Now request")
+ sub_savepapernow_web = subparsers.add_parser(
+ "savepapernow-web-results",
+ help="add webcapture entities crawled due to async Save Paper Now request",
+ )
sub_savepapernow_web.set_defaults(
func=run_savepapernow_web,
auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
)
- sub_savepapernow_web.add_argument('json_file',
+ sub_savepapernow_web.add_argument(
+ "json_file",
help="ingest-file JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_savepapernow_web.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_savepapernow_web.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
- sub_savepapernow_fileset = subparsers.add_parser('savepapernow-fileset-results',
- help="add fileset entities crawled due to async Save Paper Now request")
+ sub_savepapernow_fileset = subparsers.add_parser(
+ "savepapernow-fileset-results",
+ help="add fileset entities crawled due to async Save Paper Now request",
+ )
sub_savepapernow_fileset.set_defaults(
func=run_savepapernow_fileset,
auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
)
- sub_savepapernow_fileset.add_argument('json_file',
+ sub_savepapernow_fileset.add_argument(
+ "json_file",
help="ingest-file JSON file to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_savepapernow_fileset.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_savepapernow_fileset.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
- sub_grobid_metadata = subparsers.add_parser('grobid-metadata',
- help="create release and file entities based on GROBID PDF metadata extraction")
+ sub_grobid_metadata = subparsers.add_parser(
+ "grobid-metadata",
+ help="create release and file entities based on GROBID PDF metadata extraction",
+ )
sub_grobid_metadata.set_defaults(
func=run_grobid_metadata,
auth_var="FATCAT_API_AUTH_TOKEN",
)
- sub_grobid_metadata.add_argument('tsv_file',
+ sub_grobid_metadata.add_argument(
+ "tsv_file",
help="TSV file to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_grobid_metadata.add_argument('--group-size',
- help="editgroup group size to use",
- default=75, type=int)
- sub_grobid_metadata.add_argument('--longtail-oa',
- action='store_true',
- help="if this is an import of longtail OA content (sets an 'extra' flag)")
- sub_grobid_metadata.add_argument('--bezerk-mode',
- action='store_true',
- help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
-
- sub_shadow_lib = subparsers.add_parser('shadow-lib',
- help="create release and file entities based on GROBID PDF metadata extraction")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_grobid_metadata.add_argument(
+ "--group-size", help="editgroup group size to use", default=75, type=int
+ )
+ sub_grobid_metadata.add_argument(
+ "--longtail-oa",
+ action="store_true",
+ help="if this is an import of longtail OA content (sets an 'extra' flag)",
+ )
+ sub_grobid_metadata.add_argument(
+ "--bezerk-mode",
+ action="store_true",
+ help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)",
+ )
+
+ sub_shadow_lib = subparsers.add_parser(
+ "shadow-lib",
+ help="create release and file entities based on GROBID PDF metadata extraction",
+ )
sub_shadow_lib.set_defaults(
func=run_shadow_lib,
auth_var="FATCAT_AUTH_WORKER_SHADOW",
)
- sub_shadow_lib.add_argument('json_file',
+ sub_shadow_lib.add_argument(
+ "json_file",
help="JSON file to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
- sub_wayback_static = subparsers.add_parser('wayback-static',
- help="crude crawl+ingest tool for single-page HTML docs from wayback")
+ sub_wayback_static = subparsers.add_parser(
+ "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback"
+ )
sub_wayback_static.set_defaults(
func=run_wayback_static,
auth_var="FATCAT_API_AUTH_TOKEN",
)
- sub_wayback_static.add_argument('wayback_url',
- type=str,
- help="URL of wayback capture to extract from")
- sub_wayback_static.add_argument('--extid',
- type=str,
- help="external identifier for release lookup")
- sub_wayback_static.add_argument('--release-id',
- type=str,
- help="release entity identifier")
- sub_wayback_static.add_argument('--editgroup-id',
+ sub_wayback_static.add_argument(
+ "wayback_url", type=str, help="URL of wayback capture to extract from"
+ )
+ sub_wayback_static.add_argument(
+ "--extid", type=str, help="external identifier for release lookup"
+ )
+ sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier")
+ sub_wayback_static.add_argument(
+ "--editgroup-id",
type=str,
- help="use existing editgroup (instead of creating a new one)")
+ help="use existing editgroup (instead of creating a new one)",
+ )
- sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat',
- help="crude helper to import datasets from Dat/CDL mirror pilot project")
+ sub_cdl_dash_dat = subparsers.add_parser(
+ "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project"
+ )
sub_cdl_dash_dat.set_defaults(
func=run_cdl_dash_dat,
auth_var="FATCAT_API_AUTH_TOKEN",
)
- sub_cdl_dash_dat.add_argument('dat_path',
- type=str,
- help="local path dat to import (must be the dat discovery key)")
- sub_cdl_dash_dat.add_argument('--release-id',
- type=str,
- help="release entity identifier")
- sub_cdl_dash_dat.add_argument('--editgroup-id',
+ sub_cdl_dash_dat.add_argument(
+ "dat_path", type=str, help="local path dat to import (must be the dat discovery key)"
+ )
+ sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier")
+ sub_cdl_dash_dat.add_argument(
+ "--editgroup-id",
type=str,
- help="use existing editgroup (instead of creating a new one)")
+ help="use existing editgroup (instead of creating a new one)",
+ )
- sub_datacite = subparsers.add_parser('datacite',
- help="import datacite.org metadata")
- sub_datacite.add_argument('json_file',
+ sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
+ sub_datacite.add_argument(
+ "json_file",
help="File with jsonlines from datacite.org v2 API to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_datacite.add_argument('issn_map_file',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_datacite.add_argument(
+ "issn_map_file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
- sub_datacite.add_argument('--extid-map-file',
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_datacite.add_argument(
+ "--extid-map-file",
help="DOI-to-other-identifiers sqlite3 database",
- default=None, type=str)
- sub_datacite.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
- sub_datacite.add_argument('--bezerk-mode',
- action='store_true',
- help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
- sub_datacite.add_argument('--debug',
- action='store_true',
- help="write converted JSON to stdout")
- sub_datacite.add_argument('--insert-log-file',
- default='',
+ default=None,
type=str,
- help="write inserted documents into file (for debugging)")
+ )
+ sub_datacite.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+ sub_datacite.add_argument(
+ "--bezerk-mode",
+ action="store_true",
+ help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)",
+ )
+ sub_datacite.add_argument(
+ "--debug", action="store_true", help="write converted JSON to stdout"
+ )
+ sub_datacite.add_argument(
+ "--insert-log-file",
+ default="",
+ type=str,
+ help="write inserted documents into file (for debugging)",
+ )
sub_datacite.set_defaults(
func=run_datacite,
auth_var="FATCAT_AUTH_WORKER_DATACITE",
)
- sub_doaj_article = subparsers.add_parser('doaj-article',
- help="import doaj.org article metadata")
- sub_doaj_article.add_argument('json_file',
+ sub_doaj_article = subparsers.add_parser(
+ "doaj-article", help="import doaj.org article metadata"
+ )
+ sub_doaj_article.add_argument(
+ "json_file",
help="File with JSON lines from DOAJ API (or bulk dump) to import from",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_doaj_article.add_argument('--issn-map-file',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_doaj_article.add_argument(
+ "--issn-map-file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
- sub_doaj_article.add_argument('--kafka-mode',
- action='store_true',
- help="consume from kafka topic (not stdin)")
- sub_doaj_article.add_argument('--do-updates',
- action='store_true',
- help="update any pre-existing release entities")
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_doaj_article.add_argument(
+ "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)"
+ )
+ sub_doaj_article.add_argument(
+ "--do-updates", action="store_true", help="update any pre-existing release entities"
+ )
sub_doaj_article.set_defaults(
func=run_doaj_article,
auth_var="FATCAT_AUTH_WORKER_DOAJ",
)
- sub_dblp_release = subparsers.add_parser('dblp-release',
- help="import dblp release metadata")
- sub_dblp_release.add_argument('xml_file',
+ sub_dblp_release = subparsers.add_parser(
+ "dblp-release", help="import dblp release metadata"
+ )
+ sub_dblp_release.add_argument(
+ "xml_file",
help="File with DBLP XML to import from",
- default=sys.stdin, type=argparse.FileType('rb'))
- sub_dblp_release.add_argument('--dblp-container-map-file',
+ default=sys.stdin,
+ type=argparse.FileType("rb"),
+ )
+ sub_dblp_release.add_argument(
+ "--dblp-container-map-file",
help="file path to dblp prefix to container_id TSV file",
- default=None, type=argparse.FileType('r'))
- sub_dblp_release.add_argument('--do-updates',
- action='store_true',
- help="update any pre-existing release entities")
- sub_dblp_release.add_argument('--dump-json-mode',
- action='store_true',
- help="print release entities to stdout instead of importing")
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_dblp_release.add_argument(
+ "--do-updates", action="store_true", help="update any pre-existing release entities"
+ )
+ sub_dblp_release.add_argument(
+ "--dump-json-mode",
+ action="store_true",
+ help="print release entities to stdout instead of importing",
+ )
sub_dblp_release.set_defaults(
func=run_dblp_release,
auth_var="FATCAT_AUTH_WORKER_DBLP",
)
- sub_dblp_container = subparsers.add_parser('dblp-container',
- help="import dblp container metadata")
- sub_dblp_container.add_argument('json_file',
+ sub_dblp_container = subparsers.add_parser(
+ "dblp-container", help="import dblp container metadata"
+ )
+ sub_dblp_container.add_argument(
+ "json_file",
help="File with DBLP container JSON to import from (see extra/dblp/)",
- default=sys.stdin, type=argparse.FileType('rb'))
- sub_dblp_container.add_argument('--dblp-container-map-file',
+ default=sys.stdin,
+ type=argparse.FileType("rb"),
+ )
+ sub_dblp_container.add_argument(
+ "--dblp-container-map-file",
help="file path to dblp pre-existing prefix to container_id TSV file",
- default=None, type=argparse.FileType('r'))
- sub_dblp_container.add_argument('--dblp-container-map-output',
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_dblp_container.add_argument(
+ "--dblp-container-map-output",
help="file path to output new dblp container map TSV to",
- default=None, type=argparse.FileType('w'))
- sub_dblp_container.add_argument('--issn-map-file',
+ default=None,
+ type=argparse.FileType("w"),
+ )
+ sub_dblp_container.add_argument(
+ "--issn-map-file",
help="ISSN to ISSN-L mapping file",
- default=None, type=argparse.FileType('r'))
- sub_dblp_container.add_argument('--do-updates',
- action='store_true',
- help="update any pre-existing container entities")
+ default=None,
+ type=argparse.FileType("r"),
+ )
+ sub_dblp_container.add_argument(
+ "--do-updates", action="store_true", help="update any pre-existing container entities"
+ )
sub_dblp_container.set_defaults(
func=run_dblp_container,
auth_var="FATCAT_AUTH_WORKER_DBLP",
)
- sub_file_meta = subparsers.add_parser('file-meta',
- help="simple update-only importer for file metadata")
+ sub_file_meta = subparsers.add_parser(
+ "file-meta", help="simple update-only importer for file metadata"
+ )
sub_file_meta.set_defaults(
func=run_file_meta,
auth_var="FATCAT_API_AUTH_TOKEN",
)
- sub_file_meta.add_argument('json_file',
+ sub_file_meta.add_argument(
+ "json_file",
help="File with jsonlines from file_meta schema to import from",
- default=sys.stdin, type=argparse.FileType('r'))
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
- sub_fileset = subparsers.add_parser('fileset',
- help="generic fileset importer")
+ sub_fileset = subparsers.add_parser("fileset", help="generic fileset importer")
sub_fileset.set_defaults(
func=run_fileset,
auth_var="FATCAT_API_AUTH_TOKEN",
)
- sub_fileset.add_argument('json_file',
+ sub_fileset.add_argument(
+ "json_file",
help="File with jsonlines of fileset entities to import",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_fileset.add_argument('--skip-release-fileset-check',
- action='store_true',
- help="create without checking if releases already have related filesets")
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_fileset.add_argument(
+ "--skip-release-fileset-check",
+ action="store_true",
+ help="create without checking if releases already have related filesets",
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -889,15 +1118,18 @@ def main():
# allow editgroup description override via env variable (but CLI arg takes
# precedence)
- if not args.editgroup_description_override \
- and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
- args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')
+ if not args.editgroup_description_override and os.environ.get(
+ "FATCAT_EDITGROUP_DESCRIPTION"
+ ):
+ args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")
args.api = authenticated_api(
args.host_url,
# token is an optional kwarg (can be empty string, None, etc)
- token=os.environ.get(args.auth_var))
+ token=os.environ.get(args.auth_var),
+ )
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 165e42f3..21597fae 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -42,51 +42,56 @@ def _run_search_dump(args, search):
else:
ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env)
if args.enqueue_kafka:
- print("Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), file=sys.stderr)
+ print(
+ "Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic),
+ file=sys.stderr,
+ )
kafka_producer = simple_kafka_producer(args.kafka_hosts)
if args.limit is not None:
- search = search[:args.limit]
+ search = search[: args.limit]
if args.before_year:
- search = search \
- .filter("exists", field="release_year") \
- .filter("range", release_date=dict(lt=args.before_year))
+ search = search.filter("exists", field="release_year").filter(
+ "range", release_date=dict(lt=args.before_year)
+ )
if args.after_year:
- search = search \
- .filter("exists", field="release_year") \
- .filter("range", release_date=dict(gte=args.after_year))
+ search = search.filter("exists", field="release_year").filter(
+ "range", release_date=dict(gte=args.after_year)
+ )
if not args.allow_non_oa:
search = search.filter("term", is_oa=True)
if args.release_types:
- release_types = args.release_types.split(',')
- search = search \
- .filter("terms", release_type=release_types)
+ release_types = args.release_types.split(",")
+ search = search.filter("terms", release_type=release_types)
else:
- search = search \
- .filter("bool", must_not=[
- Q("terms", release_type=["stub", "component"])
- ])
+ search = search.filter(
+ "bool", must_not=[Q("terms", release_type=["stub", "component"])]
+ )
- counts = Counter({'ingest_request': 0, 'elasticsearch_release': 0, 'estimate': 0})
+ counts = Counter({"ingest_request": 0, "elasticsearch_release": 0, "estimate": 0})
search = search.params()
- counts['estimate'] = search.count()
- print("Expecting {} release objects in search queries".format(counts['estimate']), file=sys.stderr)
+ counts["estimate"] = search.count()
+ print(
+ "Expecting {} release objects in search queries".format(counts["estimate"]),
+ file=sys.stderr,
+ )
# don't try to clean up scroll if we are connected to public server (behind
# nginx proxy that disallows DELETE)
if args.elasticsearch_endpoint in (
- 'https://search.fatcat.wiki',
- 'https://search.qa.fatcat.wiki'):
+ "https://search.fatcat.wiki",
+ "https://search.qa.fatcat.wiki",
+ ):
search = search.params(clear_scroll=False)
results = search.scan()
for esr in results:
- if args.limit and counts['ingest_request'] >= args.limit:
+ if args.limit and counts["ingest_request"] >= args.limit:
break
- counts['elasticsearch_release'] += 1
+ counts["elasticsearch_release"] += 1
release = args.api.get_release(esr.ident)
ingest_request = release_ingest_request(
release,
@@ -96,18 +101,18 @@ def _run_search_dump(args, search):
if not ingest_request:
continue
if args.force_recrawl:
- ingest_request['force_recrawl'] = True
- counts['ingest_request'] += 1
+ ingest_request["force_recrawl"] = True
+ counts["ingest_request"] += 1
if args.dry_run:
continue
if kafka_producer is not None:
kafka_producer.produce(
ingest_file_request_topic,
- json.dumps(ingest_request).encode('utf-8'),
- #key=None,
+ json.dumps(ingest_request).encode("utf-8"),
+ # key=None,
on_delivery=kafka_fail_fast,
)
- counts['kafka'] += 1
+ counts["kafka"] += 1
else:
print(json.dumps(ingest_request))
if kafka_producer is not None:
@@ -138,7 +143,9 @@ def run_ingest_container(args):
elif args.name:
search = search.query("match", container_name=args.name)
else:
- print("You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr)
+ print(
+ "You must supply at least one query/filter parameter! Eg, ISSN-L", file=sys.stderr
+ )
sys.exit(-1)
return _run_search_dump(args, search)
@@ -150,8 +157,9 @@ def run_ingest_query(args):
way as searches in the fatcat web interface.
"""
- search = _init_search(args) \
- .filter("term", in_ia=False) \
+ search = (
+ _init_search(args)
+ .filter("term", in_ia=False)
.query(
"query_string",
query=args.query,
@@ -160,6 +168,7 @@ def run_ingest_query(args):
lenient=True,
fields=["title^5", "contrib_names^2", "container_title"],
)
+ )
return _run_search_dump(args, search)
@@ -169,86 +178,96 @@ def run_ingest_extid(args):
Selects release entities where the external identifier (extid) exists
"""
- search = _init_search(args) \
- .filter("term", in_ia=False) \
- .filter("exists", field=args.extid)
+ search = _init_search(args).filter("term", in_ia=False).filter("exists", field=args.extid)
return _run_search_dump(args, search)
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--fatcat-api-url',
- default="http://localhost:9411/v0",
- help="connect to this host/port")
- parser.add_argument('--enqueue-kafka',
- action='store_true',
- help="send ingest requests directly to sandcrawler kafka topic for processing")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument(
+ "--enqueue-kafka",
+ action="store_true",
+ help="send ingest requests directly to sandcrawler kafka topic for processing",
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--kafka-request-topic',
- help="exact Kafka ingest request topic to use")
- parser.add_argument('--elasticsearch-endpoint',
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument("--kafka-request-topic", help="exact Kafka ingest request topic to use")
+ parser.add_argument(
+ "--elasticsearch-endpoint",
default="https://search.fatcat.wiki",
- help="elasticsearch API. internal endpoint preferred, but public is default")
- parser.add_argument('--elasticsearch-index',
- default="fatcat_release",
- help="elasticsearch index to query")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
- parser.add_argument('--limit',
- default=None,
- type=int,
- help="Max number of search hits to return")
- parser.add_argument('--dry-run',
- action='store_true',
- help="runs through creating all ingest requests, but doesn't actually output or enqueue")
- parser.add_argument('--before-year',
+ help="elasticsearch API. internal endpoint preferred, but public is default",
+ )
+ parser.add_argument(
+ "--elasticsearch-index", default="fatcat_release", help="elasticsearch index to query"
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
+ parser.add_argument(
+ "--limit", default=None, type=int, help="Max number of search hits to return"
+ )
+ parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ help="runs through creating all ingest requests, but doesn't actually output or enqueue",
+ )
+ parser.add_argument(
+ "--before-year",
type=str,
- help="filters results to only with release_year before this (not inclusive)")
- parser.add_argument('--after-year',
+ help="filters results to only with release_year before this (not inclusive)",
+ )
+ parser.add_argument(
+ "--after-year",
type=str,
- help="filters results to only with release_year after this (inclusive)")
- parser.add_argument('--release-types',
+ help="filters results to only with release_year after this (inclusive)",
+ )
+ parser.add_argument(
+ "--release-types",
type=str,
- help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.")
- parser.add_argument('--allow-non-oa',
- action='store_true',
- help="By default, we limit to OA releases. This removes that filter")
- parser.add_argument('--force-recrawl',
- action='store_true',
- help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl")
- parser.add_argument('--ingest-type',
- default="pdf",
- help="What medium to ingest (pdf, xml, html)")
+ help="filters results to specified release-types, separated by commas. By default, 'stub' is filtered out.",
+ )
+ parser.add_argument(
+ "--allow-non-oa",
+ action="store_true",
+ help="By default, we limit to OA releases. This removes that filter",
+ )
+ parser.add_argument(
+ "--force-recrawl",
+ action="store_true",
+ help="Tell ingest worker to skip GWB history lookup and do SPNv2 crawl",
+ )
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="What medium to ingest (pdf, xml, html)"
+ )
subparsers = parser.add_subparsers()
- sub_container = subparsers.add_parser('container',
- help="Create ingest requests for releases from a specific container")
+ sub_container = subparsers.add_parser(
+ "container", help="Create ingest requests for releases from a specific container"
+ )
sub_container.set_defaults(func=run_ingest_container)
- sub_container.add_argument('--container-id',
- help="fatcat container entity ident")
- sub_container.add_argument('--issnl',
- help="ISSN-L of container entity")
- sub_container.add_argument('--publisher',
- help="publisher name")
- sub_container.add_argument('--name',
- help="container name")
-
- sub_query = subparsers.add_parser('query',
- help="Create ingest requests for releases from a specific query")
+ sub_container.add_argument("--container-id", help="fatcat container entity ident")
+ sub_container.add_argument("--issnl", help="ISSN-L of container entity")
+ sub_container.add_argument("--publisher", help="publisher name")
+ sub_container.add_argument("--name", help="container name")
+
+ sub_query = subparsers.add_parser(
+ "query", help="Create ingest requests for releases from a specific query"
+ )
sub_query.set_defaults(func=run_ingest_query)
- sub_query.add_argument('query',
- help="search query (same DSL as web interface search)")
+ sub_query.add_argument("query", help="search query (same DSL as web interface search)")
- sub_extid = subparsers.add_parser('extid',
- help="Create ingest requests for releases that have given extid defined")
+ sub_extid = subparsers.add_parser(
+ "extid", help="Create ingest requests for releases that have given extid defined"
+ )
sub_extid.set_defaults(func=run_ingest_extid)
- sub_extid.add_argument('extid',
- help="extid short name (as included in ES release schema)")
+ sub_extid.add_argument("extid", help="extid short name (as included in ES release schema)")
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -258,5 +277,6 @@ def main():
args.api = public_api(args.fatcat_api_url)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
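
Aside: a minimal sketch (not part of the patch) of the chained elasticsearch-dsl query-building style that the reformatted fatcat_ingest.py code above uses, where filters are chained inside parentheses instead of backslash continuations. The index name, year bound, and filter values are illustrative placeholders; it only assumes the elasticsearch-dsl package and builds the query body without contacting a cluster.

    # Sketch only: mirrors the chained .filter()/.query() style used above.
    # "fatcat_release" and the year bound are illustrative placeholders.
    from elasticsearch_dsl import Q, Search

    search = (
        Search(index="fatcat_release")
        .filter("term", in_ia=False)
        .filter("term", is_oa=True)
        .filter("exists", field="release_year")
        .filter("range", release_date=dict(gte=2010))
        .filter("bool", must_not=[Q("terms", release_type=["stub", "component"])])
    )

    # Building the query needs no connection; .to_dict() shows the JSON body
    # that search.scan() would eventually send.
    print(search.to_dict())
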
diff --git a/python/fatcat_review.py b/python/fatcat_review.py
index f719bb46..0cdfc29d 100755
--- a/python/fatcat_review.py
+++ b/python/fatcat_review.py
@@ -13,39 +13,43 @@ sentry_client = raven.Client()
def run_dummy(args):
- reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval,
- verbose=args.verbose)
+ reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval, verbose=args.verbose)
if args.editgroup:
annotation = reviewer.run_single(args.editgroup, args.annotate)
print(annotation)
elif args.continuous:
reviewer.run()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--verbose',
- action='store_true',
- help="enable verbose output")
- parser.add_argument('--fatcat-api-url',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument("--verbose", action="store_true", help="enable verbose output")
+ parser.add_argument(
+ "--fatcat-api-url",
default="http://localhost:9411/v0",
- help="fatcat API host/port to use")
- parser.add_argument('--poll-interval',
+ help="fatcat API host/port to use",
+ )
+ parser.add_argument(
+ "--poll-interval",
help="how long to wait between polling (seconds)",
- default=10.0, type=float)
+ default=10.0,
+ type=float,
+ )
subparsers = parser.add_subparsers()
- sub_dummy = subparsers.add_parser('dummy',
- help="example/demonstration review bot")
+ sub_dummy = subparsers.add_parser("dummy", help="example/demonstration review bot")
sub_dummy.set_defaults(func=run_dummy)
- sub_dummy.add_argument("--continuous",
+ sub_dummy.add_argument(
+ "--continuous",
action="store_true",
- help="run forever, polling for new reviewable editgroups")
- sub_dummy.add_argument("--editgroup",
- help="single editgroup ID to review")
- sub_dummy.add_argument("--annotate",
+ help="run forever, polling for new reviewable editgroups",
+ )
+ sub_dummy.add_argument("--editgroup", help="single editgroup ID to review")
+ sub_dummy.add_argument(
+ "--annotate",
action="store_true",
- help="for single editgroups, pushes result as annotation")
+ help="for single editgroups, pushes result as annotation",
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -58,5 +62,6 @@ def main():
args.api = authenticated_api(args.fatcat_api_url)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index ab855dbf..fe2e12a6 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -31,10 +31,10 @@ def run_elasticsearch_releases(args):
if not line:
continue
entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
- if entity.state != 'active':
+ if entity.state != "active":
continue
- args.json_output.write(
- json.dumps(release_to_elasticsearch(entity)) + '\n')
+ args.json_output.write(json.dumps(release_to_elasticsearch(entity)) + "\n")
+
def run_elasticsearch_containers(args):
es_client = elasticsearch.Elasticsearch(args.fatcat_elasticsearch_url)
@@ -44,7 +44,7 @@ def run_elasticsearch_containers(args):
if not line:
continue
entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client)
- if entity.state != 'active':
+ if entity.state != "active":
continue
if args.query_stats:
@@ -60,7 +60,8 @@ def run_elasticsearch_containers(args):
else:
es_doc = container_to_elasticsearch(entity)
- args.json_output.write(json.dumps(es_doc) + '\n')
+ args.json_output.write(json.dumps(es_doc) + "\n")
+
def run_elasticsearch_files(args):
for line in args.json_input:
@@ -68,10 +69,10 @@ def run_elasticsearch_files(args):
if not line:
continue
entity = entity_from_json(line, FileEntity, api_client=args.api.api_client)
- if entity.state != 'active':
+ if entity.state != "active":
continue
- args.json_output.write(
- json.dumps(file_to_elasticsearch(entity)) + '\n')
+ args.json_output.write(json.dumps(file_to_elasticsearch(entity)) + "\n")
+
def run_elasticsearch_changelogs(args):
for line in args.json_input:
@@ -79,8 +80,8 @@ def run_elasticsearch_changelogs(args):
if not line:
continue
entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client)
- args.json_output.write(
- json.dumps(changelog_to_elasticsearch(entity)) + '\n')
+ args.json_output.write(json.dumps(changelog_to_elasticsearch(entity)) + "\n")
+
def run_citeproc_releases(args):
for line in args.json_input:
@@ -88,82 +89,126 @@ def run_citeproc_releases(args):
if not line:
continue
entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
- if entity.state != 'active':
+ if entity.state != "active":
continue
csl_json = release_to_csl(entity)
- csl_json['id'] = "release:" + (entity.ident or "unknown")
+ csl_json["id"] = "release:" + (entity.ident or "unknown")
out = citeproc_csl(csl_json, args.style, args.html)
args.json_output.write(out + "\n")
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--fatcat-api-url',
- default="http://localhost:9411/v0",
- help="connect to this host/port")
- parser.add_argument('--fatcat-elasticsearch-url',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument(
+ "--fatcat-elasticsearch-url",
default="http://localhost:9200",
- help="connect to this host/port")
+ help="connect to this host/port",
+ )
subparsers = parser.add_subparsers()
- sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases',
- help="convert fatcat release JSON schema to elasticsearch release schema")
+ sub_elasticsearch_releases = subparsers.add_parser(
+ "elasticsearch-releases",
+ help="convert fatcat release JSON schema to elasticsearch release schema",
+ )
sub_elasticsearch_releases.set_defaults(func=run_elasticsearch_releases)
- sub_elasticsearch_releases.add_argument('json_input',
+ sub_elasticsearch_releases.add_argument(
+ "json_input",
help="JSON-per-line of release entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_elasticsearch_releases.add_argument('json_output',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_elasticsearch_releases.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
-
- sub_elasticsearch_containers = subparsers.add_parser('elasticsearch-containers',
- help="convert fatcat container JSON schema to elasticsearch container schema")
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
+
+ sub_elasticsearch_containers = subparsers.add_parser(
+ "elasticsearch-containers",
+ help="convert fatcat container JSON schema to elasticsearch container schema",
+ )
sub_elasticsearch_containers.set_defaults(func=run_elasticsearch_containers)
- sub_elasticsearch_containers.add_argument('json_input',
+ sub_elasticsearch_containers.add_argument(
+ "json_input",
help="JSON-per-line of container entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_elasticsearch_containers.add_argument('json_output',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_elasticsearch_containers.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
- sub_elasticsearch_containers.add_argument('--query-stats',
- action='store_true',
- help="whether to query release search index for container stats")
-
- sub_elasticsearch_files = subparsers.add_parser('elasticsearch-files',
- help="convert fatcat file JSON schema to elasticsearch file schema")
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
+ sub_elasticsearch_containers.add_argument(
+ "--query-stats",
+ action="store_true",
+ help="whether to query release search index for container stats",
+ )
+
+ sub_elasticsearch_files = subparsers.add_parser(
+ "elasticsearch-files",
+ help="convert fatcat file JSON schema to elasticsearch file schema",
+ )
sub_elasticsearch_files.set_defaults(func=run_elasticsearch_files)
- sub_elasticsearch_files.add_argument('json_input',
+ sub_elasticsearch_files.add_argument(
+ "json_input",
help="JSON-per-line of file entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_elasticsearch_files.add_argument('json_output',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_elasticsearch_files.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
-
- sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs',
- help="convert fatcat changelog JSON schema to elasticsearch changelog schema")
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
+
+ sub_elasticsearch_changelogs = subparsers.add_parser(
+ "elasticsearch-changelogs",
+ help="convert fatcat changelog JSON schema to elasticsearch changelog schema",
+ )
sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs)
- sub_elasticsearch_changelogs.add_argument('json_input',
+ sub_elasticsearch_changelogs.add_argument(
+ "json_input",
help="JSON-per-line of changelog entries",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_elasticsearch_changelogs.add_argument('json_output',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_elasticsearch_changelogs.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
-
- sub_citeproc_releases = subparsers.add_parser('citeproc-releases',
- help="convert fatcat release schema to any standard citation format using citeproc/CSL")
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
+
+ sub_citeproc_releases = subparsers.add_parser(
+ "citeproc-releases",
+ help="convert fatcat release schema to any standard citation format using citeproc/CSL",
+ )
sub_citeproc_releases.set_defaults(func=run_citeproc_releases)
- sub_citeproc_releases.add_argument('json_input',
+ sub_citeproc_releases.add_argument(
+ "json_input",
help="JSON-per-line of release entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_citeproc_releases.add_argument('json_output',
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+ sub_citeproc_releases.add_argument(
+ "json_output",
help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
- sub_citeproc_releases.add_argument('--style',
- help="citation style to output",
- default='csl-json')
- sub_citeproc_releases.add_argument('--html',
- action='store_true',
- help="output HTML, not plain text")
+ default=sys.stdout,
+ type=argparse.FileType("w"),
+ )
+ sub_citeproc_releases.add_argument(
+ "--style", help="citation style to output", default="csl-json"
+ )
+ sub_citeproc_releases.add_argument(
+ "--html", action="store_true", help="output HTML, not plain text"
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -173,5 +218,6 @@ def main():
args.api = public_api(args.fatcat_api_url)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
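
Aside: a stdlib-only sketch (not part of the patch) of the JSON-lines filter/transform loop that the fatcat_transform.py subcommands above follow: read entities line by line, skip blanks and non-active entities, write one transformed JSON document per line. to_es_doc() is a hypothetical stand-in for transforms like release_to_elasticsearch(), and nargs="?" is added here so the stdin/stdout defaults actually apply.

    # Minimal sketch of the jsonlines transform loop pattern; to_es_doc() is
    # a hypothetical placeholder transform, not a fatcat_tools function.
    import argparse
    import json
    import sys


    def to_es_doc(entity: dict) -> dict:
        # Hypothetical transform: pass the record through unchanged.
        return entity


    def main() -> None:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "json_input", nargs="?", default=sys.stdin, type=argparse.FileType("r")
        )
        parser.add_argument(
            "json_output", nargs="?", default=sys.stdout, type=argparse.FileType("w")
        )
        args = parser.parse_args()

        for line in args.json_input:
            line = line.strip()
            if not line:
                continue
            entity = json.loads(line)
            # Skip anything that is not an active entity, as the real commands do.
            if entity.get("state") != "active":
                continue
            args.json_output.write(json.dumps(to_es_doc(entity)) + "\n")


    if __name__ == "__main__":
        main()
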
diff --git a/python/fatcat_util.py b/python/fatcat_util.py
index a8e99ac3..57102e9e 100755
--- a/python/fatcat_util.py
+++ b/python/fatcat_util.py
@@ -16,47 +16,50 @@ from fatcat_tools import authenticated_api, fcid2uuid, uuid2fcid
def run_uuid2fcid(args):
print(uuid2fcid(args.uuid))
+
def run_fcid2uuid(args):
print(fcid2uuid(args.fcid))
+
def run_editgroup_accept(args):
args.api.accept_editgroup(args.editgroup_id)
+
def run_editgroup_submit(args):
eg = args.api.get_editgroup(args.editgroup_id)
args.api.update_editgroup(args.editgroup_id, eg, submit=True)
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--fatcat-api-url',
- default="http://localhost:9411/v0",
- help="connect to this host/port")
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
subparsers = parser.add_subparsers()
- sub_uuid2fcid = subparsers.add_parser('uuid2fcid',
- help="convert a standard UUID (as string) to fatcat ident format")
+ sub_uuid2fcid = subparsers.add_parser(
+ "uuid2fcid", help="convert a standard UUID (as string) to fatcat ident format"
+ )
sub_uuid2fcid.set_defaults(func=run_uuid2fcid)
- sub_uuid2fcid.add_argument('uuid',
- help="UUID to transform")
+ sub_uuid2fcid.add_argument("uuid", help="UUID to transform")
- sub_fcid2uuid = subparsers.add_parser('fcid2uuid',
- help="convert a fatcat ident string to standard UUID format")
+ sub_fcid2uuid = subparsers.add_parser(
+ "fcid2uuid", help="convert a fatcat ident string to standard UUID format"
+ )
sub_fcid2uuid.set_defaults(func=run_fcid2uuid)
- sub_fcid2uuid.add_argument('fcid',
- help="FCID to transform (into UUID)")
+ sub_fcid2uuid.add_argument("fcid", help="FCID to transform (into UUID)")
- sub_editgroup_accept = subparsers.add_parser('editgroup-accept',
- help="accept an editgroup (by ident)")
+ sub_editgroup_accept = subparsers.add_parser(
+ "editgroup-accept", help="accept an editgroup (by ident)"
+ )
sub_editgroup_accept.set_defaults(func=run_editgroup_accept)
- sub_editgroup_accept.add_argument('editgroup_id',
- help="editgroup to accept")
+ sub_editgroup_accept.add_argument("editgroup_id", help="editgroup to accept")
- sub_editgroup_submit = subparsers.add_parser('editgroup-submit',
- help="submit an editgroup for review (by ident)")
+ sub_editgroup_submit = subparsers.add_parser(
+ "editgroup-submit", help="submit an editgroup for review (by ident)"
+ )
sub_editgroup_submit.set_defaults(func=run_editgroup_submit)
- sub_editgroup_submit.add_argument('editgroup_id',
- help="editgroup to submit")
+ sub_editgroup_submit.add_argument("editgroup_id", help="editgroup to submit")
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -66,5 +69,6 @@ def main():
args.api = authenticated_api(args.fatcat_api_url)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
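
Aside: a self-contained sketch (not part of the patch) of the subcommand dispatch pattern shared by these CLIs, as seen in fatcat_util.py above: each subparser binds a handler with set_defaults(func=...), and main() checks args.__dict__.get("func") before dispatching. The subcommand name, handler, and error handling here are illustrative.

    # Sketch of the subparsers + set_defaults(func=...) dispatch pattern.
    import argparse
    import sys


    def run_hello(args: argparse.Namespace) -> None:
        # Illustrative handler, analogous to run_uuid2fcid() etc.
        print("hello,", args.name)


    def main() -> None:
        parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter
        )
        subparsers = parser.add_subparsers()

        sub_hello = subparsers.add_parser("hello", help="example subcommand")
        sub_hello.set_defaults(func=run_hello)
        sub_hello.add_argument("name", help="who to greet")

        args = parser.parse_args()
        # Bail out when no subcommand was selected (argparse leaves func unset).
        if not args.__dict__.get("func"):
            parser.print_help(file=sys.stderr)
            sys.exit(-1)
        args.func(args)


    if __name__ == "__main__":
        main()
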
diff --git a/python/fatcat_webface.py b/python/fatcat_webface.py
index d12e8dad..acaa5936 100755
--- a/python/fatcat_webface.py
+++ b/python/fatcat_webface.py
@@ -6,21 +6,18 @@ from fatcat_web import app
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--debug',
- action='store_true',
- help="enable debugging interface (note: not for everything)")
- parser.add_argument('--host',
- default="127.0.0.1",
- help="listen on this host/IP")
- parser.add_argument('--port',
- type=int,
- default=9810,
- help="listen on this port")
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--debug",
+ action="store_true",
+ help="enable debugging interface (note: not for everything)",
+ )
+ parser.add_argument("--host", default="127.0.0.1", help="listen on this host/IP")
+ parser.add_argument("--port", type=int, default=9810, help="listen on this port")
args = parser.parse_args()
app.run(debug=args.debug, host=args.host, port=args.port)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index 397cf731..b776e0ce 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -20,10 +20,12 @@ sentry_client = raven.Client()
def run_changelog(args):
topic = "fatcat-{}.changelog".format(args.env)
- worker = ChangelogWorker(args.api, args.kafka_hosts, topic,
- poll_interval=args.poll_interval)
+ worker = ChangelogWorker(
+ args.api, args.kafka_hosts, topic, poll_interval=args.poll_interval
+ )
worker.run()
+
def run_entity_updates(args):
changelog_topic = "fatcat-{}.changelog".format(args.env)
release_topic = "fatcat-{}.release-updates-v03".format(args.env)
@@ -31,7 +33,9 @@ def run_entity_updates(args):
container_topic = "fatcat-{}.container-updates".format(args.env)
work_ident_topic = "fatcat-{}.work-ident-updates".format(args.env)
ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env)
- worker = EntityUpdatesWorker(args.api, args.kafka_hosts,
+ worker = EntityUpdatesWorker(
+ args.api,
+ args.kafka_hosts,
changelog_topic,
release_topic=release_topic,
file_topic=file_topic,
@@ -41,86 +45,126 @@ def run_entity_updates(args):
)
worker.run()
+
def run_elasticsearch_release(args):
consume_topic = "fatcat-{}.release-updates-v03".format(args.env)
- worker = ElasticsearchReleaseWorker(args.kafka_hosts, consume_topic,
+ worker = ElasticsearchReleaseWorker(
+ args.kafka_hosts,
+ consume_topic,
elasticsearch_backend=args.elasticsearch_backend,
- elasticsearch_index=args.elasticsearch_index)
+ elasticsearch_index=args.elasticsearch_index,
+ )
worker.run()
+
def run_elasticsearch_container(args):
consume_topic = "fatcat-{}.container-updates".format(args.env)
- worker = ElasticsearchContainerWorker(args.kafka_hosts, consume_topic,
+ worker = ElasticsearchContainerWorker(
+ args.kafka_hosts,
+ consume_topic,
query_stats=args.query_stats,
elasticsearch_release_index="fatcat_release",
elasticsearch_backend=args.elasticsearch_backend,
- elasticsearch_index=args.elasticsearch_index)
+ elasticsearch_index=args.elasticsearch_index,
+ )
worker.run()
+
def run_elasticsearch_changelog(args):
consume_topic = "fatcat-{}.changelog".format(args.env)
- worker = ElasticsearchChangelogWorker(args.kafka_hosts, consume_topic,
+ worker = ElasticsearchChangelogWorker(
+ args.kafka_hosts,
+ consume_topic,
elasticsearch_backend=args.elasticsearch_backend,
- elasticsearch_index=args.elasticsearch_index)
+ elasticsearch_index=args.elasticsearch_index,
+ )
worker.run()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--api-host-url',
- default="http://localhost:9411/v0",
- help="fatcat API host/port to use")
- parser.add_argument('--kafka-hosts',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--api-host-url", default="http://localhost:9411/v0", help="fatcat API host/port to use"
+ )
+ parser.add_argument(
+ "--kafka-hosts",
default="localhost:9092",
- help="list of Kafka brokers (host/port) to use")
- parser.add_argument('--env',
- default="dev",
- help="Kafka topic namespace to use (eg, prod, qa, dev)")
+ help="list of Kafka brokers (host/port) to use",
+ )
+ parser.add_argument(
+ "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)"
+ )
subparsers = parser.add_subparsers()
- sub_changelog = subparsers.add_parser('changelog',
- help="poll fatcat API for changelog entries, push to kafka")
+ sub_changelog = subparsers.add_parser(
+ "changelog", help="poll fatcat API for changelog entries, push to kafka"
+ )
sub_changelog.set_defaults(func=run_changelog)
- sub_changelog.add_argument('--poll-interval',
+ sub_changelog.add_argument(
+ "--poll-interval",
help="how long to wait between polling (seconds)",
- default=5.0, type=float)
+ default=5.0,
+ type=float,
+ )
- sub_entity_updates = subparsers.add_parser('entity-updates',
- help="poll kafka for changelog entries; push entity changes to various kafka topics")
+ sub_entity_updates = subparsers.add_parser(
+ "entity-updates",
+ help="poll kafka for changelog entries; push entity changes to various kafka topics",
+ )
sub_entity_updates.set_defaults(func=run_entity_updates)
- sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release',
- help="consume kafka feed of new/updated releases, transform and push to search")
+ sub_elasticsearch_release = subparsers.add_parser(
+ "elasticsearch-release",
+ help="consume kafka feed of new/updated releases, transform and push to search",
+ )
sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release)
- sub_elasticsearch_release.add_argument('--elasticsearch-backend',
+ sub_elasticsearch_release.add_argument(
+ "--elasticsearch-backend",
help="elasticsearch backend to connect to",
- default="http://localhost:9200")
- sub_elasticsearch_release.add_argument('--elasticsearch-index',
+ default="http://localhost:9200",
+ )
+ sub_elasticsearch_release.add_argument(
+ "--elasticsearch-index",
help="elasticsearch index to push into",
- default="fatcat_release_v03")
+ default="fatcat_release_v03",
+ )
- sub_elasticsearch_container = subparsers.add_parser('elasticsearch-container',
- help="consume kafka feed of new/updated containers, transform and push to search")
+ sub_elasticsearch_container = subparsers.add_parser(
+ "elasticsearch-container",
+ help="consume kafka feed of new/updated containers, transform and push to search",
+ )
sub_elasticsearch_container.set_defaults(func=run_elasticsearch_container)
- sub_elasticsearch_container.add_argument('--elasticsearch-backend',
+ sub_elasticsearch_container.add_argument(
+ "--elasticsearch-backend",
help="elasticsearch backend to connect to",
- default="http://localhost:9200")
- sub_elasticsearch_container.add_argument('--elasticsearch-index',
+ default="http://localhost:9200",
+ )
+ sub_elasticsearch_container.add_argument(
+ "--elasticsearch-index",
help="elasticsearch index to push into",
- default="fatcat_container")
- sub_elasticsearch_container.add_argument('--query-stats',
- action='store_true',
- help="whether to query release search index for container stats")
+ default="fatcat_container",
+ )
+ sub_elasticsearch_container.add_argument(
+ "--query-stats",
+ action="store_true",
+ help="whether to query release search index for container stats",
+ )
- sub_elasticsearch_changelog = subparsers.add_parser('elasticsearch-changelog',
- help="consume changelog kafka feed, transform and push to search")
+ sub_elasticsearch_changelog = subparsers.add_parser(
+ "elasticsearch-changelog",
+ help="consume changelog kafka feed, transform and push to search",
+ )
sub_elasticsearch_changelog.set_defaults(func=run_elasticsearch_changelog)
- sub_elasticsearch_changelog.add_argument('--elasticsearch-backend',
+ sub_elasticsearch_changelog.add_argument(
+ "--elasticsearch-backend",
help="elasticsearch backend to connect to",
- default="http://localhost:9200")
- sub_elasticsearch_changelog.add_argument('--elasticsearch-index',
+ default="http://localhost:9200",
+ )
+ sub_elasticsearch_changelog.add_argument(
+ "--elasticsearch-index",
help="elasticsearch index to push into",
- default="fatcat_changelog")
+ default="fatcat_changelog",
+ )
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -130,5 +174,6 @@ def main():
args.api = public_api(args.api_host_url)
args.func(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
diff --git a/python/shell.py b/python/shell.py
index 9b561f5f..e8879a70 100644
--- a/python/shell.py
+++ b/python/shell.py
@@ -12,9 +12,9 @@ from fatcat_openapi_client.rest import ApiException
from fatcat_tools import *
-if __name__ == '__main__':
+if __name__ == "__main__":
- #api =
+ # api =
print(" __ _ _ _ ")
print(" / _| __ _| |_ ___ __ _| |_| |")
print("| |_ / _` | __/ __/ _` | __| |")
@@ -24,23 +24,27 @@ if __name__ == '__main__':
admin_id = "aaaaaaaaaaaabkvkaaaaaaaaae"
- #fatcat_openapi_client.configuration.api_key['Authorization'] = 'YOUR_API_KEY'
- #fatcat_openapi_client.configuration.api_key_prefix['Authorization'] = 'Bearer'
+ # fatcat_openapi_client.configuration.api_key['Authorization'] = 'YOUR_API_KEY'
+ # fatcat_openapi_client.configuration.api_key_prefix['Authorization'] = 'Bearer'
local_conf = fatcat_openapi_client.Configuration()
- local_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug="
+ local_conf.api_key[
+ "Authorization"
+ ] = "AgEPZGV2LmZhdGNhdC53aWtpAhYyMDE5MDEwMS1kZXYtZHVtbXkta2V5AAImZWRpdG9yX2lkID0gYWFhYWFhYWFhYWFhYmt2a2FhYWFhYWFhYWkAAht0aW1lID4gMjAxOS0wMS0wOVQwMDo1Nzo1MloAAAYgnroNha1hSftChtxHGTnLEmM/pY8MeQS/jBSV0UNvXug="
local_conf.api_key_prefix["Authorization"] = "Bearer"
- local_conf.host = 'http://localhost:9411/v0'
+ local_conf.host = "http://localhost:9411/v0"
local_conf.debug = True
local_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(local_conf))
- #prod_conf = fatcat_openapi_client.Configuration()
- #prod_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw"
- #prod_conf.api_key_prefix["Authorization"] = "Bearer"
- #prod_conf.host = 'https://api.fatcat.wiki/v0'
- #prod_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(prod_conf))
+ # prod_conf = fatcat_openapi_client.Configuration()
+ # prod_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw"
+ # prod_conf.api_key_prefix["Authorization"] = "Bearer"
+ # prod_conf.host = 'https://api.fatcat.wiki/v0'
+ # prod_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(prod_conf))
qa_conf = fatcat_openapi_client.Configuration()
- qa_conf.api_key["Authorization"] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw"
+ qa_conf.api_key[
+ "Authorization"
+ ] = "AgEPZGV2LmZhdGNhdC53aWtpAg4yMDE4LTEyLTMxLWRldgACJmVkaXRvcl9pZCA9IGFhYWFhYWFhYWFhYWJrdmthYWFhYWFhYWFlAAIeY3JlYXRlZCA9IDIwMTgtMTItMzFUMjE6MTU6NDdaAAAGIMWFZeZ54pH4OzNl5+U5X3p1H1rMioSuIldihuiM5XAw"
qa_conf.api_key_prefix["Authorization"] = "Bearer"
- qa_conf.host = 'https://api.qa.fatcat.wiki/v0'
+ qa_conf.host = "https://api.qa.fatcat.wiki/v0"
qa_api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(qa_conf))