Diffstat (limited to 'python/fatcat_import.py')
-rwxr-xr-x | python/fatcat_import.py | 950 |
1 file changed, 591 insertions(+), 359 deletions(-)
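The diff below is essentially a mechanical reformatting pass over fatcat_import.py, apparently in the style enforced by an auto-formatter such as black (an assumption; the commit message is not shown): single-quoted strings become double-quoted, and long constructor or `add_argument` calls are wrapped with one argument per line and a trailing comma. A minimal sketch of the before/after pattern, using a hypothetical `make_importer` helper rather than the real importer classes:

```python
# Illustration only: make_importer and its keyword arguments are hypothetical
# stand-ins for the importer constructors touched in this diff.
def make_importer(api, issn_map_file=None, edit_batch_size=50, bezerk_mode=False):
    return (api, issn_map_file, edit_batch_size, bezerk_mode)

# Old style (pre-diff): single quotes, continuation lines hanging off the open paren.
importer = make_importer('api-client',
    'issn_map.tsv',
    edit_batch_size=50)

# New style (post-diff): double quotes, one argument per line, trailing comma.
importer = make_importer(
    "api-client",
    "issn_map.tsv",
    edit_batch_size=50,
    bezerk_mode=False,
)
```

Both calls are equivalent at runtime; the change is purely stylistic, which is why the diff touches nearly every function without altering behavior.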
diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 3225688c..42001974 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -51,11 +51,13 @@ sentry_client = raven.Client() def run_crossref(args): - fci = CrossrefImporter(args.api, + fci = CrossrefImporter( + args.api, args.issn_map_file, extid_map_file=args.extid_map_file, edit_batch_size=args.batch_size, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + ) if args.kafka_mode: KafkaJsonPusher( fci, @@ -68,15 +70,14 @@ def run_crossref(args): else: JsonLinePusher(fci, args.json_file).run() + def run_jalc(args): - ji = JalcImporter(args.api, - args.issn_map_file, - extid_map_file=args.extid_map_file) + ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run() + def run_arxiv(args): - ari = ArxivRawImporter(args.api, - edit_batch_size=args.batch_size) + ari = ArxivRawImporter(args.api, edit_batch_size=args.batch_size) if args.kafka_mode: KafkaBs4XmlPusher( ari, @@ -87,15 +88,18 @@ def run_arxiv(args): ).run() else: if args.xml_file == sys.stdin: - print('note: reading from stdin', file=sys.stderr) + print("note: reading from stdin", file=sys.stderr) Bs4XmlFilePusher(ari, args.xml_file, "record").run() + def run_pubmed(args): - pi = PubmedImporter(args.api, + pi = PubmedImporter( + args.api, args.issn_map_file, edit_batch_size=args.batch_size, do_updates=args.do_updates, - lookup_refs=(not args.no_lookup_refs)) + lookup_refs=(not args.no_lookup_refs), + ) if args.kafka_mode: KafkaBs4XmlPusher( pi, @@ -111,62 +115,67 @@ def run_pubmed(args): ["PubmedArticle"], ).run() + def run_jstor(args): - ji = JstorImporter(args.api, - args.issn_map_file, - edit_batch_size=args.batch_size) + ji = JstorImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size) Bs4XmlFileListPusher(ji, args.list_file, "article").run() + def run_orcid(args): - foi = OrcidImporter(args.api, - edit_batch_size=args.batch_size) + foi = OrcidImporter(args.api, edit_batch_size=args.batch_size) JsonLinePusher(foi, args.json_file).run() + def run_journal_metadata(args): - fii = JournalMetadataImporter(args.api, - edit_batch_size=args.batch_size) + fii = JournalMetadataImporter(args.api, edit_batch_size=args.batch_size) JsonLinePusher(fii, args.json_file).run() + def run_chocula(args): - fii = ChoculaImporter(args.api, - edit_batch_size=args.batch_size, - do_updates=args.do_updates) + fii = ChoculaImporter(args.api, edit_batch_size=args.batch_size, do_updates=args.do_updates) JsonLinePusher(fii, args.json_file).run() + def run_matched(args): - fmi = MatchedImporter(args.api, + fmi = MatchedImporter( + args.api, edit_batch_size=args.batch_size, editgroup_description=args.editgroup_description_override, default_link_rel=args.default_link_rel, - default_mimetype=args.default_mimetype) + default_mimetype=args.default_mimetype, + ) JsonLinePusher(fmi, args.json_file).run() + def run_arabesque_match(args): - if (args.sqlite_file and args.json_file) or not (args.sqlite_file or - args.json_file): + if (args.sqlite_file and args.json_file) or not (args.sqlite_file or args.json_file): print("Supply one of --sqlite-file or --json-file") - ami = ArabesqueMatchImporter(args.api, + ami = ArabesqueMatchImporter( + args.api, editgroup_description=args.editgroup_description_override, do_updates=args.do_updates, require_grobid=(not args.no_require_grobid), extid_type=args.extid_type, crawl_id=args.crawl_id, default_link_rel=args.default_link_rel, - 
edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.sqlite_file: - SqlitePusher(ami, args.sqlite_file, "crawl_result", - ARABESQUE_MATCH_WHERE_CLAUSE).run() + SqlitePusher(ami, args.sqlite_file, "crawl_result", ARABESQUE_MATCH_WHERE_CLAUSE).run() elif args.json_file: JsonLinePusher(ami, args.json_file).run() + def run_ingest_file(args): - ifri = IngestFileResultImporter(args.api, + ifri = IngestFileResultImporter( + args.api, editgroup_description=args.editgroup_description_override, skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, require_grobid=(not args.no_require_grobid), - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -180,13 +189,16 @@ def run_ingest_file(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_ingest_web(args): - iwri = IngestWebResultImporter(args.api, + iwri = IngestWebResultImporter( + args.api, editgroup_description=args.editgroup_description_override, skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( iwri, @@ -201,13 +213,16 @@ def run_ingest_web(args): else: JsonLinePusher(iwri, args.json_file).run() + def run_ingest_fileset(args): - ifri = IngestFilesetResultImporter(args.api, + ifri = IngestFilesetResultImporter( + args.api, editgroup_description=args.editgroup_description_override, skip_source_allowlist=args.skip_source_allowlist, do_updates=args.do_updates, default_link_rel=args.default_link_rel, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -222,10 +237,13 @@ def run_ingest_fileset(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_savepapernow_file(args): - ifri = SavePaperNowFileImporter(args.api, + ifri = SavePaperNowFileImporter( + args.api, editgroup_description=args.editgroup_description_override, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -240,10 +258,13 @@ def run_savepapernow_file(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_savepapernow_web(args): - ifri = SavePaperNowWebImporter(args.api, + ifri = SavePaperNowWebImporter( + args.api, editgroup_description=args.editgroup_description_override, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -258,10 +279,13 @@ def run_savepapernow_web(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_savepapernow_fileset(args): - ifri = SavePaperNowFilesetImporter(args.api, + ifri = SavePaperNowFilesetImporter( + args.api, editgroup_description=args.editgroup_description_override, - edit_batch_size=args.batch_size) + edit_batch_size=args.batch_size, + ) if args.kafka_mode: KafkaJsonPusher( ifri, @@ -276,18 +300,22 @@ def run_savepapernow_fileset(args): else: JsonLinePusher(ifri, args.json_file).run() + def run_grobid_metadata(args): - fmi = GrobidMetadataImporter(args.api, + fmi = GrobidMetadataImporter( + args.api, edit_batch_size=args.batch_size, longtail_oa=args.longtail_oa, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + ) LinePusher(fmi, args.tsv_file).run() + def run_shadow_lib(args): - fmi = ShadowLibraryImporter(args.api, - edit_batch_size=100) + fmi = 
ShadowLibraryImporter(args.api, edit_batch_size=100) JsonLinePusher(fmi, args.json_file).run() + def run_wayback_static(args): api = args.api @@ -295,8 +323,8 @@ def run_wayback_static(args): if args.release_id: release_id = args.release_id elif args.extid: - idtype = args.extid.split(':')[0] - extid = ':'.join(args.extid.split(':')[1:]) + idtype = args.extid.split(":")[0] + extid = ":".join(args.extid.split(":")[1:]) if idtype == "doi": release_id = api.lookup_release(doi=extid).ident elif idtype == "pmid": @@ -309,8 +337,9 @@ def run_wayback_static(args): raise Exception("need either release_id or extid argument") # create it - (editgroup_id, wc) = auto_wayback_static(api, release_id, args.wayback_url, - editgroup_id=args.editgroup_id) + (editgroup_id, wc) = auto_wayback_static( + api, release_id, args.wayback_url, editgroup_id=args.editgroup_id + ) if not wc: return print("release_id: {}".format(release_id)) @@ -318,12 +347,14 @@ def run_wayback_static(args): print("webcapture id: {}".format(wc.ident)) print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) + def run_cdl_dash_dat(args): api = args.api # create it - (editgroup_id, release, fs) = auto_cdl_dash_dat(api, args.dat_path, - release_id=args.release_id, editgroup_id=args.editgroup_id) + (editgroup_id, release, fs) = auto_cdl_dash_dat( + api, args.dat_path, release_id=args.release_id, editgroup_id=args.editgroup_id + ) if not fs: return print("release_id: {}".format(release.ident)) @@ -331,14 +362,17 @@ def run_cdl_dash_dat(args): print("fileset id: {}".format(fs.ident)) print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) + def run_datacite(args): - dci = DataciteImporter(args.api, + dci = DataciteImporter( + args.api, args.issn_map_file, edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, extid_map_file=args.extid_map_file, - insert_log_file=args.insert_log_file) + insert_log_file=args.insert_log_file, + ) if args.kafka_mode: KafkaJsonPusher( dci, @@ -351,8 +385,10 @@ def run_datacite(args): else: JsonLinePusher(dci, args.json_file).run() + def run_doaj_article(args): - dai = DoajArticleImporter(args.api, + dai = DoajArticleImporter( + args.api, args.issn_map_file, edit_batch_size=args.batch_size, do_updates=args.do_updates, @@ -369,8 +405,10 @@ def run_doaj_article(args): else: JsonLinePusher(dai, args.json_file).run() + def run_dblp_release(args): - dri = DblpReleaseImporter(args.api, + dri = DblpReleaseImporter( + args.api, dblp_container_map_file=args.dblp_container_map_file, edit_batch_size=args.batch_size, do_updates=args.do_updates, @@ -383,8 +421,10 @@ def run_dblp_release(args): use_lxml=True, ).run() + def run_dblp_container(args): - dci = DblpContainerImporter(args.api, + dci = DblpContainerImporter( + args.api, args.issn_map_file, dblp_container_map_file=args.dblp_container_map_file, dblp_container_map_output=args.dblp_container_map_output, @@ -393,14 +433,17 @@ def run_dblp_container(args): ) JsonLinePusher(dci, args.json_file).run() + def run_file_meta(args): # do_updates defaults to true for this importer - fmi = FileMetaImporter(args.api, + fmi = FileMetaImporter( + args.api, edit_batch_size=100, editgroup_description=args.editgroup_description_override, ) JsonLinePusher(fmi, args.json_file).run() + def run_fileset(args): fmi = FilesetImporter( args.api, @@ -409,478 +452,664 @@ def run_fileset(args): ) JsonLinePusher(fmi, args.json_file).run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - 
parser.add_argument('--host-url', - default="http://localhost:9411/v0", - help="connect to this host/port") - parser.add_argument('--kafka-hosts', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" + ) + parser.add_argument( + "--kafka-hosts", default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa)") - parser.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - parser.add_argument('--editgroup-description-override', + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa)" + ) + parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) + parser.add_argument( + "--editgroup-description-override", help="editgroup description override", - default=None, type=str) + default=None, + type=str, + ) subparsers = parser.add_subparsers() - sub_crossref = subparsers.add_parser('crossref', - help="import Crossref API metadata format (JSON)") + sub_crossref = subparsers.add_parser( + "crossref", help="import Crossref API metadata format (JSON)" + ) sub_crossref.set_defaults( func=run_crossref, auth_var="FATCAT_AUTH_WORKER_CROSSREF", ) - sub_crossref.add_argument('json_file', + sub_crossref.add_argument( + "json_file", help="crossref JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_crossref.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_crossref.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_crossref.add_argument('--extid-map-file', + default=None, + type=argparse.FileType("r"), + ) + sub_crossref.add_argument( + "--extid-map-file", help="DOI-to-other-identifiers sqlite3 database", - default=None, type=str) - sub_crossref.add_argument('--no-lookup-refs', - action='store_true', - help="skip lookup of references (PMID or DOI)") - sub_crossref.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_crossref.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") - - sub_jalc = subparsers.add_parser('jalc', - help="import JALC DOI metadata from XML dump") + default=None, + type=str, + ) + sub_crossref.add_argument( + "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" + ) + sub_crossref.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_crossref.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)", + ) + + sub_jalc = subparsers.add_parser("jalc", help="import JALC DOI metadata from XML dump") sub_jalc.set_defaults( func=run_jalc, auth_var="FATCAT_AUTH_WORKER_JALC", ) - sub_jalc.add_argument('xml_file', + sub_jalc.add_argument( + "xml_file", help="Jalc RDF XML file (record-per-line) to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_jalc.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_jalc.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping 
file", - default=None, type=argparse.FileType('r')) - sub_jalc.add_argument('--extid-map-file', + default=None, + type=argparse.FileType("r"), + ) + sub_jalc.add_argument( + "--extid-map-file", help="DOI-to-other-identifiers sqlite3 database", - default=None, type=str) + default=None, + type=str, + ) - sub_arxiv = subparsers.add_parser('arxiv', - help="import arxiv.org metadata from XML files") + sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files") sub_arxiv.set_defaults( func=run_arxiv, auth_var="FATCAT_AUTH_WORKER_ARXIV", ) - sub_arxiv.add_argument('xml_file', - nargs='?', + sub_arxiv.add_argument( + "xml_file", + nargs="?", help="arXivRaw XML file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_arxiv.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_arxiv.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_pubmed = subparsers.add_parser('pubmed', - help="import MEDLINE/PubMed work-level metadata (XML)") + sub_pubmed = subparsers.add_parser( + "pubmed", help="import MEDLINE/PubMed work-level metadata (XML)" + ) sub_pubmed.set_defaults( func=run_pubmed, auth_var="FATCAT_AUTH_WORKER_PUBMED", ) - sub_pubmed.add_argument('xml_file', - nargs='?', + sub_pubmed.add_argument( + "xml_file", + nargs="?", help="Pubmed XML file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_pubmed.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_pubmed.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_pubmed.add_argument('--no-lookup-refs', - action='store_true', - help="skip lookup of references (PMID or DOI)") - sub_pubmed.add_argument('--do-updates', - action='store_true', - help="update pre-existing release entities") - sub_pubmed.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - - sub_jstor = subparsers.add_parser('jstor', - help="import JSTOR work-level metadata from XML dump") + default=None, + type=argparse.FileType("r"), + ) + sub_pubmed.add_argument( + "--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)" + ) + sub_pubmed.add_argument( + "--do-updates", action="store_true", help="update pre-existing release entities" + ) + sub_pubmed.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + + sub_jstor = subparsers.add_parser( + "jstor", help="import JSTOR work-level metadata from XML dump" + ) sub_jstor.set_defaults( func=run_jstor, auth_var="FATCAT_AUTH_WORKER_JSTOR", ) - sub_jstor.add_argument('list_file', + sub_jstor.add_argument( + "list_file", help="List of JSTOR XML file paths to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_jstor.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_jstor.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) + default=None, + type=argparse.FileType("r"), + ) - sub_orcid = subparsers.add_parser('orcid', - help="import creator entities from ORCID XML dump") - sub_orcid.set_defaults( - func=run_orcid, - auth_var="FATCAT_AUTH_WORKER_ORCID" + sub_orcid = subparsers.add_parser( + "orcid", help="import creator entities from ORCID XML dump" ) - 
sub_orcid.add_argument('json_file', + sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID") + sub_orcid.add_argument( + "json_file", help="orcid JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_journal_metadata = subparsers.add_parser('journal-metadata', - help="import/update container metadata from old manual munging format") + sub_journal_metadata = subparsers.add_parser( + "journal-metadata", + help="import/update container metadata from old manual munging format", + ) sub_journal_metadata.set_defaults( func=run_journal_metadata, auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_journal_metadata.add_argument('json_file', + sub_journal_metadata.add_argument( + "json_file", help="Journal JSON metadata file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_chocula = subparsers.add_parser('chocula', - help="import/update container metadata from chocula JSON export") + sub_chocula = subparsers.add_parser( + "chocula", help="import/update container metadata from chocula JSON export" + ) sub_chocula.set_defaults( func=run_chocula, auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA", ) - sub_chocula.add_argument('json_file', + sub_chocula.add_argument( + "json_file", help="chocula JSON entities file (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_chocula.add_argument('--do-updates', - action='store_true', - help="update pre-existing container entities") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_chocula.add_argument( + "--do-updates", action="store_true", help="update pre-existing container entities" + ) - sub_matched = subparsers.add_parser('matched', - help="add file entities matched against existing releases; custom JSON format") + sub_matched = subparsers.add_parser( + "matched", + help="add file entities matched against existing releases; custom JSON format", + ) sub_matched.set_defaults( func=run_matched, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_matched.add_argument('json_file', + sub_matched.add_argument( + "json_file", help="JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_matched.add_argument('--default-mimetype', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_matched.add_argument( + "--default-mimetype", default=None, - help="default mimetype for imported files (if not specified per-file)") - sub_matched.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") - sub_matched.add_argument('--default-link-rel', + help="default mimetype for imported files (if not specified per-file)", + ) + sub_matched.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)", + ) + sub_matched.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_arabesque_match = subparsers.add_parser('arabesque', - help="add file entities matched to releases from crawl log analysis") + sub_arabesque_match = subparsers.add_parser( + "arabesque", help="add file entities matched to releases from crawl log analysis" + ) sub_arabesque_match.set_defaults( func=run_arabesque_match, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) 
- sub_arabesque_match.add_argument('--sqlite-file', - help="sqlite database file to import from") - sub_arabesque_match.add_argument('--json-file', - help="JSON file to import from (or stdin)", - type=argparse.FileType('r')) - sub_arabesque_match.add_argument('--do-updates', - action='store_true', - help="update pre-existing file entities if new match (instead of skipping)") - sub_arabesque_match.add_argument('--no-require-grobid', - action='store_true', - help="whether postproc_status column must be '200'") - sub_arabesque_match.add_argument('--extid-type', + sub_arabesque_match.add_argument( + "--sqlite-file", help="sqlite database file to import from" + ) + sub_arabesque_match.add_argument( + "--json-file", help="JSON file to import from (or stdin)", type=argparse.FileType("r") + ) + sub_arabesque_match.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing file entities if new match (instead of skipping)", + ) + sub_arabesque_match.add_argument( + "--no-require-grobid", + action="store_true", + help="whether postproc_status column must be '200'", + ) + sub_arabesque_match.add_argument( + "--extid-type", default="doi", - help="identifier type in the database (eg, 'doi', 'pmcid'") - sub_arabesque_match.add_argument('--crawl-id', - help="crawl ID (optionally included in editgroup metadata)") - sub_arabesque_match.add_argument('--default-link-rel', + help="identifier type in the database (eg, 'doi', 'pmcid'", + ) + sub_arabesque_match.add_argument( + "--crawl-id", help="crawl ID (optionally included in editgroup metadata)" + ) + sub_arabesque_match.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_ingest_file = subparsers.add_parser('ingest-file-results', - help="add/update file entities linked to releases based on sandcrawler ingest results") + sub_ingest_file = subparsers.add_parser( + "ingest-file-results", + help="add/update file entities linked to releases based on sandcrawler ingest results", + ) sub_ingest_file.set_defaults( func=run_ingest_file, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_ingest_file.add_argument('json_file', + sub_ingest_file.add_argument( + "json_file", help="ingest_file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_file.add_argument('--skip-source-allowlist', - action='store_true', - help="don't filter import based on request source allowlist") - sub_ingest_file.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_ingest_file.add_argument('--do-updates', - action='store_true', - help="update pre-existing file entities if new match (instead of skipping)") - sub_ingest_file.add_argument('--no-require-grobid', - action='store_true', - help="whether postproc_status column must be '200'") - sub_ingest_file.add_argument('--default-link-rel', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_ingest_file.add_argument( + "--skip-source-allowlist", + action="store_true", + help="don't filter import based on request source allowlist", + ) + sub_ingest_file.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_ingest_file.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing file entities if new match (instead of skipping)", + ) + sub_ingest_file.add_argument( + "--no-require-grobid", + action="store_true", + help="whether 
postproc_status column must be '200'", + ) + sub_ingest_file.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_ingest_web = subparsers.add_parser('ingest-web-results', - help="add/update web entities linked to releases based on sandcrawler ingest results") + sub_ingest_web = subparsers.add_parser( + "ingest-web-results", + help="add/update web entities linked to releases based on sandcrawler ingest results", + ) sub_ingest_web.set_defaults( func=run_ingest_web, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_ingest_web.add_argument('json_file', + sub_ingest_web.add_argument( + "json_file", help="ingest_web JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_web.add_argument('--skip-source-allowlist', - action='store_true', - help="don't filter import based on request source allowlist") - sub_ingest_web.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_ingest_web.add_argument('--do-updates', - action='store_true', - help="update pre-existing web entities if new match (instead of skipping)") - sub_ingest_web.add_argument('--default-link-rel', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_ingest_web.add_argument( + "--skip-source-allowlist", + action="store_true", + help="don't filter import based on request source allowlist", + ) + sub_ingest_web.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_ingest_web.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing web entities if new match (instead of skipping)", + ) + sub_ingest_web.add_argument( + "--default-link-rel", default="web", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_ingest_fileset = subparsers.add_parser('ingest-fileset-results', - help="add/update fileset entities linked to releases based on sandcrawler ingest results") + sub_ingest_fileset = subparsers.add_parser( + "ingest-fileset-results", + help="add/update fileset entities linked to releases based on sandcrawler ingest results", + ) sub_ingest_fileset.set_defaults( func=run_ingest_fileset, auth_var="FATCAT_AUTH_WORKER_CRAWL", ) - sub_ingest_fileset.add_argument('json_file', + sub_ingest_fileset.add_argument( + "json_file", help="ingest_fileset JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_ingest_fileset.add_argument('--skip-source-allowlist', - action='store_true', - help="don't filter import based on request source allowlist") - sub_ingest_fileset.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_ingest_fileset.add_argument('--do-updates', - action='store_true', - help="update pre-existing fileset entities if new match (instead of skipping)") - sub_ingest_fileset.add_argument('--default-link-rel', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_ingest_fileset.add_argument( + "--skip-source-allowlist", + action="store_true", + help="don't filter import based on request source allowlist", + ) + sub_ingest_fileset.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_ingest_fileset.add_argument( + "--do-updates", + action="store_true", + help="update pre-existing fileset entities if new match (instead of 
skipping)", + ) + sub_ingest_fileset.add_argument( + "--default-link-rel", default="fileset", - help="default URL rel for matches (eg, 'publisher', 'web')") + help="default URL rel for matches (eg, 'publisher', 'web')", + ) - sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results', - help="add file entities crawled due to async Save Paper Now request") + sub_savepapernow_file = subparsers.add_parser( + "savepapernow-file-results", + help="add file entities crawled due to async Save Paper Now request", + ) sub_savepapernow_file.set_defaults( func=run_savepapernow_file, auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW", ) - sub_savepapernow_file.add_argument('json_file', + sub_savepapernow_file.add_argument( + "json_file", help="ingest-file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_savepapernow_file.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_savepapernow_file.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_savepapernow_web = subparsers.add_parser('savepapernow-web-results', - help="add webcapture entities crawled due to async Save Paper Now request") + sub_savepapernow_web = subparsers.add_parser( + "savepapernow-web-results", + help="add webcapture entities crawled due to async Save Paper Now request", + ) sub_savepapernow_web.set_defaults( func=run_savepapernow_web, auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW", ) - sub_savepapernow_web.add_argument('json_file', + sub_savepapernow_web.add_argument( + "json_file", help="ingest-file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_savepapernow_web.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_savepapernow_web.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_savepapernow_fileset = subparsers.add_parser('savepapernow-fileset-results', - help="add fileset entities crawled due to async Save Paper Now request") + sub_savepapernow_fileset = subparsers.add_parser( + "savepapernow-fileset-results", + help="add fileset entities crawled due to async Save Paper Now request", + ) sub_savepapernow_fileset.set_defaults( func=run_savepapernow_fileset, auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW", ) - sub_savepapernow_fileset.add_argument('json_file', + sub_savepapernow_fileset.add_argument( + "json_file", help="ingest-file JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_savepapernow_fileset.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_savepapernow_fileset.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) - sub_grobid_metadata = subparsers.add_parser('grobid-metadata', - help="create release and file entities based on GROBID PDF metadata extraction") + sub_grobid_metadata = subparsers.add_parser( + "grobid-metadata", + help="create release and file entities based on GROBID PDF metadata extraction", + ) sub_grobid_metadata.set_defaults( func=run_grobid_metadata, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_grobid_metadata.add_argument('tsv_file', + sub_grobid_metadata.add_argument( + "tsv_file", help="TSV file to import from (or stdin)", - 
default=sys.stdin, type=argparse.FileType('r')) - sub_grobid_metadata.add_argument('--group-size', - help="editgroup group size to use", - default=75, type=int) - sub_grobid_metadata.add_argument('--longtail-oa', - action='store_true', - help="if this is an import of longtail OA content (sets an 'extra' flag)") - sub_grobid_metadata.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)") - - sub_shadow_lib = subparsers.add_parser('shadow-lib', - help="create release and file entities based on GROBID PDF metadata extraction") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_grobid_metadata.add_argument( + "--group-size", help="editgroup group size to use", default=75, type=int + ) + sub_grobid_metadata.add_argument( + "--longtail-oa", + action="store_true", + help="if this is an import of longtail OA content (sets an 'extra' flag)", + ) + sub_grobid_metadata.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)", + ) + + sub_shadow_lib = subparsers.add_parser( + "shadow-lib", + help="create release and file entities based on GROBID PDF metadata extraction", + ) sub_shadow_lib.set_defaults( func=run_shadow_lib, auth_var="FATCAT_AUTH_WORKER_SHADOW", ) - sub_shadow_lib.add_argument('json_file', + sub_shadow_lib.add_argument( + "json_file", help="JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_wayback_static = subparsers.add_parser('wayback-static', - help="crude crawl+ingest tool for single-page HTML docs from wayback") + sub_wayback_static = subparsers.add_parser( + "wayback-static", help="crude crawl+ingest tool for single-page HTML docs from wayback" + ) sub_wayback_static.set_defaults( func=run_wayback_static, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_wayback_static.add_argument('wayback_url', - type=str, - help="URL of wayback capture to extract from") - sub_wayback_static.add_argument('--extid', - type=str, - help="external identifier for release lookup") - sub_wayback_static.add_argument('--release-id', - type=str, - help="release entity identifier") - sub_wayback_static.add_argument('--editgroup-id', + sub_wayback_static.add_argument( + "wayback_url", type=str, help="URL of wayback capture to extract from" + ) + sub_wayback_static.add_argument( + "--extid", type=str, help="external identifier for release lookup" + ) + sub_wayback_static.add_argument("--release-id", type=str, help="release entity identifier") + sub_wayback_static.add_argument( + "--editgroup-id", type=str, - help="use existing editgroup (instead of creating a new one)") + help="use existing editgroup (instead of creating a new one)", + ) - sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat', - help="crude helper to import datasets from Dat/CDL mirror pilot project") + sub_cdl_dash_dat = subparsers.add_parser( + "cdl-dash-dat", help="crude helper to import datasets from Dat/CDL mirror pilot project" + ) sub_cdl_dash_dat.set_defaults( func=run_cdl_dash_dat, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_cdl_dash_dat.add_argument('dat_path', - type=str, - help="local path dat to import (must be the dat discovery key)") - sub_cdl_dash_dat.add_argument('--release-id', - type=str, - help="release entity identifier") - sub_cdl_dash_dat.add_argument('--editgroup-id', + sub_cdl_dash_dat.add_argument( + "dat_path", type=str, help="local path dat to 
import (must be the dat discovery key)" + ) + sub_cdl_dash_dat.add_argument("--release-id", type=str, help="release entity identifier") + sub_cdl_dash_dat.add_argument( + "--editgroup-id", type=str, - help="use existing editgroup (instead of creating a new one)") + help="use existing editgroup (instead of creating a new one)", + ) - sub_datacite = subparsers.add_parser('datacite', - help="import datacite.org metadata") - sub_datacite.add_argument('json_file', + sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata") + sub_datacite.add_argument( + "json_file", help="File with jsonlines from datacite.org v2 API to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_datacite.add_argument('issn_map_file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_datacite.add_argument( + "issn_map_file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_datacite.add_argument('--extid-map-file', + default=None, + type=argparse.FileType("r"), + ) + sub_datacite.add_argument( + "--extid-map-file", help="DOI-to-other-identifiers sqlite3 database", - default=None, type=str) - sub_datacite.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_datacite.add_argument('--bezerk-mode', - action='store_true', - help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") - sub_datacite.add_argument('--debug', - action='store_true', - help="write converted JSON to stdout") - sub_datacite.add_argument('--insert-log-file', - default='', + default=None, type=str, - help="write inserted documents into file (for debugging)") + ) + sub_datacite.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_datacite.add_argument( + "--bezerk-mode", + action="store_true", + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)", + ) + sub_datacite.add_argument( + "--debug", action="store_true", help="write converted JSON to stdout" + ) + sub_datacite.add_argument( + "--insert-log-file", + default="", + type=str, + help="write inserted documents into file (for debugging)", + ) sub_datacite.set_defaults( func=run_datacite, auth_var="FATCAT_AUTH_WORKER_DATACITE", ) - sub_doaj_article = subparsers.add_parser('doaj-article', - help="import doaj.org article metadata") - sub_doaj_article.add_argument('json_file', + sub_doaj_article = subparsers.add_parser( + "doaj-article", help="import doaj.org article metadata" + ) + sub_doaj_article.add_argument( + "json_file", help="File with JSON lines from DOAJ API (or bulk dump) to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_doaj_article.add_argument('--issn-map-file', + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_doaj_article.add_argument( + "--issn-map-file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_doaj_article.add_argument('--kafka-mode', - action='store_true', - help="consume from kafka topic (not stdin)") - sub_doaj_article.add_argument('--do-updates', - action='store_true', - help="update any pre-existing release entities") + default=None, + type=argparse.FileType("r"), + ) + sub_doaj_article.add_argument( + "--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)" + ) + sub_doaj_article.add_argument( + "--do-updates", action="store_true", help="update any pre-existing release entities" + ) sub_doaj_article.set_defaults( 
func=run_doaj_article, auth_var="FATCAT_AUTH_WORKER_DOAJ", ) - sub_dblp_release = subparsers.add_parser('dblp-release', - help="import dblp release metadata") - sub_dblp_release.add_argument('xml_file', + sub_dblp_release = subparsers.add_parser( + "dblp-release", help="import dblp release metadata" + ) + sub_dblp_release.add_argument( + "xml_file", help="File with DBLP XML to import from", - default=sys.stdin, type=argparse.FileType('rb')) - sub_dblp_release.add_argument('--dblp-container-map-file', + default=sys.stdin, + type=argparse.FileType("rb"), + ) + sub_dblp_release.add_argument( + "--dblp-container-map-file", help="file path to dblp prefix to container_id TSV file", - default=None, type=argparse.FileType('r')) - sub_dblp_release.add_argument('--do-updates', - action='store_true', - help="update any pre-existing release entities") - sub_dblp_release.add_argument('--dump-json-mode', - action='store_true', - help="print release entities to stdout instead of importing") + default=None, + type=argparse.FileType("r"), + ) + sub_dblp_release.add_argument( + "--do-updates", action="store_true", help="update any pre-existing release entities" + ) + sub_dblp_release.add_argument( + "--dump-json-mode", + action="store_true", + help="print release entities to stdout instead of importing", + ) sub_dblp_release.set_defaults( func=run_dblp_release, auth_var="FATCAT_AUTH_WORKER_DBLP", ) - sub_dblp_container = subparsers.add_parser('dblp-container', - help="import dblp container metadata") - sub_dblp_container.add_argument('json_file', + sub_dblp_container = subparsers.add_parser( + "dblp-container", help="import dblp container metadata" + ) + sub_dblp_container.add_argument( + "json_file", help="File with DBLP container JSON to import from (see extra/dblp/)", - default=sys.stdin, type=argparse.FileType('rb')) - sub_dblp_container.add_argument('--dblp-container-map-file', + default=sys.stdin, + type=argparse.FileType("rb"), + ) + sub_dblp_container.add_argument( + "--dblp-container-map-file", help="file path to dblp pre-existing prefix to container_id TSV file", - default=None, type=argparse.FileType('r')) - sub_dblp_container.add_argument('--dblp-container-map-output', + default=None, + type=argparse.FileType("r"), + ) + sub_dblp_container.add_argument( + "--dblp-container-map-output", help="file path to output new dblp container map TSV to", - default=None, type=argparse.FileType('w')) - sub_dblp_container.add_argument('--issn-map-file', + default=None, + type=argparse.FileType("w"), + ) + sub_dblp_container.add_argument( + "--issn-map-file", help="ISSN to ISSN-L mapping file", - default=None, type=argparse.FileType('r')) - sub_dblp_container.add_argument('--do-updates', - action='store_true', - help="update any pre-existing container entities") + default=None, + type=argparse.FileType("r"), + ) + sub_dblp_container.add_argument( + "--do-updates", action="store_true", help="update any pre-existing container entities" + ) sub_dblp_container.set_defaults( func=run_dblp_container, auth_var="FATCAT_AUTH_WORKER_DBLP", ) - sub_file_meta = subparsers.add_parser('file-meta', - help="simple update-only importer for file metadata") + sub_file_meta = subparsers.add_parser( + "file-meta", help="simple update-only importer for file metadata" + ) sub_file_meta.set_defaults( func=run_file_meta, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_file_meta.add_argument('json_file', + sub_file_meta.add_argument( + "json_file", help="File with jsonlines from file_meta schema to import from", - default=sys.stdin, 
type=argparse.FileType('r')) + default=sys.stdin, + type=argparse.FileType("r"), + ) - sub_fileset = subparsers.add_parser('fileset', - help="generic fileset importer") + sub_fileset = subparsers.add_parser("fileset", help="generic fileset importer") sub_fileset.set_defaults( func=run_fileset, auth_var="FATCAT_API_AUTH_TOKEN", ) - sub_fileset.add_argument('json_file', + sub_fileset.add_argument( + "json_file", help="File with jsonlines of fileset entities to import", - default=sys.stdin, type=argparse.FileType('r')) - sub_fileset.add_argument('--skip-release-fileset-check', - action='store_true', - help="create without checking if releases already have related filesets") + default=sys.stdin, + type=argparse.FileType("r"), + ) + sub_fileset.add_argument( + "--skip-release-fileset-check", + action="store_true", + help="create without checking if releases already have related filesets", + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -889,15 +1118,18 @@ def main(): # allow editgroup description override via env variable (but CLI arg takes # precedence) - if not args.editgroup_description_override \ - and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'): - args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION') + if not args.editgroup_description_override and os.environ.get( + "FATCAT_EDITGROUP_DESCRIPTION" + ): + args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION") args.api = authenticated_api( args.host_url, # token is an optional kwarg (can be empty string, None, etc) - token=os.environ.get(args.auth_var)) + token=os.environ.get(args.auth_var), + ) args.func(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() |