-rw-r--r-- | extra/docker/docker-compose.yml | 13
-rwxr-xr-x | python/fatcat_cleanup.py        | 10
-rwxr-xr-x | python/fatcat_export.py         | 10
-rwxr-xr-x | python/fatcat_harvest.py        | 18
-rwxr-xr-x | python/fatcat_import.py         | 48
-rwxr-xr-x | python/fatcat_review.py         | 16
-rwxr-xr-x | python/fatcat_transform.py      | 22
-rwxr-xr-x | python/fatcat_util.py           | 22
-rwxr-xr-x | python/fatcat_webface.py        |  3
-rwxr-xr-x | python/fatcat_worker.py         | 18
-rw-r--r-- | python/shell.py                 |  6
11 files changed, 108 insertions, 78 deletions
diff --git a/extra/docker/docker-compose.yml b/extra/docker/docker-compose.yml
index a51d43e1..efc16941 100644
--- a/extra/docker/docker-compose.yml
+++ b/extra/docker/docker-compose.yml
@@ -17,6 +17,8 @@ services:
       KAFKA_MESSAGE_MAX_BYTES: 50000000
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock
+    depends_on:
+      - zookeeper
   elasticsearch:
     build: ../elasticsearch/
     ports:
@@ -26,4 +28,15 @@ services:
       cluster.name: "docker-cluster"
       bootstrap.memory_lock: "true"
       discovery.type: "single-node"
+      cluster.routing.allocation.disk.watermark.low: "500mb"
+      cluster.routing.allocation.disk.watermark.high: "500mb"
+      cluster.routing.allocation.disk.watermark.flood_stage: "100mb"
       ES_JAVA_OPTS: "-Xms512m -Xmx512m"
+  kafka_pixy:
+    image: mailgun/kafka-pixy:0.17.0
+    entrypoint: /usr/bin/kafka-pixy -kafkaPeers kafka:9092 -zookeeperPeers zookeeper:2181
+    ports:
+      - "19091:19091"
+      - "19092:19092"
+    depends_on:
+      - kafka
diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py
index 42887299..a7c80965 100755
--- a/python/fatcat_cleanup.py
+++ b/python/fatcat_cleanup.py
@@ -14,8 +14,9 @@ def run_files(args):
     JsonLinePusher(fmi, args.json_file).run()

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--host-url',
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--fatcat-api-url',
         default="http://localhost:9411/v0",
         help="connect to this host/port")
     parser.add_argument('--batch-size',
@@ -29,7 +30,8 @@ def main():
         default=False, type=bool)
     subparsers = parser.add_subparsers()

-    sub_files = subparsers.add_parser('files')
+    sub_files = subparsers.add_parser('files',
+        help="attempt metadata cleanups over a list of file entities")
     sub_files.set_defaults(
         func=run_files,
         auth_var="FATCAT_AUTH_WORKER_CLEANUP",
@@ -50,7 +52,7 @@ def main():
         args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')

     args.api = authenticated_api(
-        args.host_url,
+        args.fatcat_api_url,
         # token is an optional kwarg (can be empty string, None, etc)
         token=os.environ.get(args.auth_var))
     args.func(args)
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index 1001dbf6..5419e46c 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -37,11 +37,9 @@ def run_export_changelog(args):
             json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n")

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debugging interface")
-    parser.add_argument('--host-url',
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--fatcat-api-url',
         default="http://localhost:9411/v0",
         help="connect to this host/port")
     subparsers = parser.add_subparsers()
@@ -72,7 +70,7 @@ def main():
         print("tell me what to do!")
         sys.exit(-1)

-    args.api = public_api(args.host_url)
+    args.api = public_api(args.fatcat_api_url)
     args.func(args)

 if __name__ == '__main__':
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 1b92a5fd..58bef9ca 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -73,10 +73,8 @@ def mkdate(raw):
     return datetime.datetime.strptime(raw, "%Y-%m-%d").date()

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debug logging")
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--kafka-hosts',
         default="localhost:9092",
         help="list of Kafka brokers (host/port) to use")
@@ -97,16 +95,20 @@ def main():
         help="continue harvesting indefinitely in a loop?")

     subparsers = parser.add_subparsers()
-    sub_crossref = subparsers.add_parser('crossref')
+    sub_crossref = subparsers.add_parser('crossref',
+        help="harvest DOI metadata from Crossref API (JSON)")
     sub_crossref.set_defaults(func=run_crossref)

-    sub_datacite = subparsers.add_parser('datacite')
+    sub_datacite = subparsers.add_parser('datacite',
+        help="harvest DOI metadata from Datacite API (JSON)")
     sub_datacite.set_defaults(func=run_datacite)

-    sub_arxiv = subparsers.add_parser('arxiv')
+    sub_arxiv = subparsers.add_parser('arxiv',
+        help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)")
     sub_arxiv.set_defaults(func=run_arxiv)

-    sub_pubmed = subparsers.add_parser('pubmed')
+    sub_pubmed = subparsers.add_parser('pubmed',
+        help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)")
     sub_pubmed.set_defaults(func=run_pubmed)

     # DOAJ stuff disabled because API range-requests are broken
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 656fe87d..04f58ff7 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -156,10 +156,8 @@ def run_cdl_dash_dat(args):
     print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debugging interface")
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--host-url',
         default="http://localhost:9411/v0",
         help="connect to this host/port")
@@ -177,7 +175,8 @@ def main():
         default=None, type=str)
     subparsers = parser.add_subparsers()

-    sub_crossref = subparsers.add_parser('crossref')
+    sub_crossref = subparsers.add_parser('crossref',
+        help="import Crossref API metadata format (JSON)")
     sub_crossref.set_defaults(
         func=run_crossref,
         auth_var="FATCAT_AUTH_WORKER_CROSSREF",
@@ -201,7 +200,8 @@ def main():
         action='store_true',
         help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")

-    sub_jalc = subparsers.add_parser('jalc')
+    sub_jalc = subparsers.add_parser('jalc',
+        help="import JALC DOI metadata from XML dump")
     sub_jalc.set_defaults(
         func=run_jalc,
         auth_var="FATCAT_AUTH_WORKER_JALC",
@@ -216,7 +216,8 @@ def main():
         help="DOI-to-other-identifiers sqlite3 database",
         default=None, type=str)

-    sub_arxiv = subparsers.add_parser('arxiv')
+    sub_arxiv = subparsers.add_parser('arxiv',
+        help="import arxiv.org metadata from XML files")
     sub_arxiv.set_defaults(
         func=run_arxiv,
         auth_var="FATCAT_AUTH_WORKER_ARXIV",
@@ -228,7 +229,8 @@ def main():
         action='store_true',
         help="consume from kafka topic (not stdin)")

-    sub_pubmed = subparsers.add_parser('pubmed')
+    sub_pubmed = subparsers.add_parser('pubmed',
+        help="import MEDLINE/PubMed work-level metadata (XML)")
     sub_pubmed.set_defaults(
         func=run_pubmed,
         auth_var="FATCAT_AUTH_WORKER_PUBMED",
@@ -246,7 +248,8 @@ def main():
         action='store_true',
         help="consume from kafka topic (not stdin)")

-    sub_jstor = subparsers.add_parser('jstor')
+    sub_jstor = subparsers.add_parser('jstor',
+        help="import JSTOR work-level metadata from XML dump")
     sub_jstor.set_defaults(
         func=run_jstor,
         auth_var="FATCAT_AUTH_WORKER_JSTOR",
@@ -258,7 +261,8 @@ def main():
         help="ISSN to ISSN-L mapping file",
         default=None, type=argparse.FileType('r'))

-    sub_orcid = subparsers.add_parser('orcid')
+    sub_orcid = subparsers.add_parser('orcid',
+        help="import creator entities from ORCID XML dump")
     sub_orcid.set_defaults(
         func=run_orcid,
         auth_var="FATCAT_AUTH_WORKER_ORCID"
@@ -267,7 +271,8 @@ def main():
         help="orcid JSON file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))

-    sub_journal_metadata = subparsers.add_parser('journal-metadata')
+    sub_journal_metadata = subparsers.add_parser('journal-metadata',
+        help="import/update container metadata from old manual munging format")
     sub_journal_metadata.set_defaults(
         func=run_journal_metadata,
         auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
@@ -276,7 +281,8 @@ def main():
         help="Journal JSON metadata file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))

-    sub_chocula = subparsers.add_parser('chocula')
+    sub_chocula = subparsers.add_parser('chocula',
+        help="import/update container metadata from chocula JSON export")
     sub_chocula.set_defaults(
         func=run_chocula,
         auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
@@ -285,7 +291,8 @@ def main():
         help="chocula JSON entities file (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))

-    sub_matched = subparsers.add_parser('matched')
+    sub_matched = subparsers.add_parser('matched',
+        help="add file entities matched against existing releases; custom JSON format")
     sub_matched.set_defaults(
         func=run_matched,
         auth_var="FATCAT_API_AUTH_TOKEN",
@@ -303,7 +310,8 @@ def main():
         default="web",
         help="default URL rel for matches (eg, 'publisher', 'web')")

-    sub_arabesque_match = subparsers.add_parser('arabesque')
+    sub_arabesque_match = subparsers.add_parser('arabesque',
+        help="add file entities matched to releases from crawl log analysis")
     sub_arabesque_match.set_defaults(
         func=run_arabesque_match,
         auth_var="FATCAT_AUTH_WORKER_CRAWL",
@@ -328,7 +336,8 @@ def main():
         default="web",
         help="default URL rel for matches (eg, 'publisher', 'web')")

-    sub_ingest_file = subparsers.add_parser('ingest-file-results')
+    sub_ingest_file = subparsers.add_parser('ingest-file-results',
+        help="add/update flie entities linked to releases based on sandcrawler ingest results")
     sub_ingest_file.set_defaults(
         func=run_ingest_file,
         auth_var="FATCAT_AUTH_WORKER_CRAWL",
@@ -352,7 +361,8 @@ def main():
         default="web",
         help="default URL rel for matches (eg, 'publisher', 'web')")

-    sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
+    sub_grobid_metadata = subparsers.add_parser('grobid-metadata',
+        help="create release and file entities based on GROBID PDF metadata extraction")
     sub_grobid_metadata.set_defaults(
         func=run_grobid_metadata,
         auth_var="FATCAT_API_AUTH_TOKEN",
@@ -370,7 +380,8 @@ def main():
         action='store_true',
         help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

-    sub_wayback_static = subparsers.add_parser('wayback-static')
+    sub_wayback_static = subparsers.add_parser('wayback-static',
+        help="crude crawl+ingest tool for single-page HTML docs from wayback")
     sub_wayback_static.set_defaults(
         func=run_wayback_static,
         auth_var="FATCAT_API_AUTH_TOKEN",
@@ -388,7 +399,8 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")

-    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat')
+    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat',
+        help="crude helper to import datasets from Dat/CDL mirror pilot project")
     sub_cdl_dash_dat.set_defaults(
         func=run_cdl_dash_dat,
         auth_var="FATCAT_API_AUTH_TOKEN",
diff --git a/python/fatcat_review.py b/python/fatcat_review.py
index 40bc7041..1d1db9a5 100755
--- a/python/fatcat_review.py
+++ b/python/fatcat_review.py
@@ -14,7 +14,7 @@ sentry_client = raven.Client()

 def run_dummy(args):
     reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval,
-        verbose=args.debug)
+        verbose=args.verbose)
     if args.editgroup:
         annotation = reviewer.run_single(args.editgroup, args.annotate)
         print(annotation)
@@ -22,11 +22,12 @@ def run_dummy(args):
         reviewer.run()

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--verbose',
         action='store_true',
-        help="enable debug logging")
-    parser.add_argument('--api-host-url',
+        help="enable verbose output")
+    parser.add_argument('--fatcat-api-url',
         default="http://localhost:9411/v0",
         help="fatcat API host/port to use")
     parser.add_argument('--poll-interval',
@@ -34,7 +35,8 @@ def main():
         default=10.0, type=float)
     subparsers = parser.add_subparsers()

-    sub_dummy = subparsers.add_parser('dummy')
+    sub_dummy = subparsers.add_parser('dummy',
+        help="example/demonstration review bot")
     sub_dummy.set_defaults(func=run_dummy)
     sub_dummy.add_argument("--continuous",
         action="store_true",
@@ -53,7 +55,7 @@ def main():
         print("need to run on a single editgroup, or continuous")
         sys.exit(-1)

-    args.api = authenticated_api(args.api_host_url)
+    args.api = authenticated_api(args.fatcat_api_url)
     args.func(args)

 if __name__ == '__main__':
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index 3f90337e..ccb13871 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -60,16 +60,15 @@ def run_citeproc_releases(args):
         args.json_output.write(out + "\n")

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debugging interface")
-    parser.add_argument('--host-url',
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--fatcat-api-url',
         default="http://localhost:9411/v0",
         help="connect to this host/port")
     subparsers = parser.add_subparsers()

-    sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases')
+    sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases',
+        help="convert fatcat release JSON schema to elasticsearch release schema")
     sub_elasticsearch_releases.set_defaults(func=run_elasticsearch_releases)
     sub_elasticsearch_releases.add_argument('json_input',
         help="JSON-per-line of release entities",
@@ -78,7 +77,8 @@ def main():
         help="where to send output",
         default=sys.stdout, type=argparse.FileType('w'))

-    sub_elasticsearch_containers = subparsers.add_parser('elasticsearch-containers')
+    sub_elasticsearch_containers = subparsers.add_parser('elasticsearch-containers',
+        help="convert fatcat container JSON schema to elasticsearch container schema")
     sub_elasticsearch_containers.set_defaults(func=run_elasticsearch_containers)
     sub_elasticsearch_containers.add_argument('json_input',
         help="JSON-per-line of container entities",
@@ -87,7 +87,8 @@ def main():
         help="where to send output",
         default=sys.stdout, type=argparse.FileType('w'))

-    sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs')
+    sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs',
+        help="convert fatcat changelog JSON schema to elasticsearch changelog schema")
     sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs)
     sub_elasticsearch_changelogs.add_argument('json_input',
         help="JSON-per-line of changelog entries",
@@ -96,7 +97,8 @@ def main():
         help="where to send output",
         default=sys.stdout, type=argparse.FileType('w'))

-    sub_citeproc_releases = subparsers.add_parser('citeproc-releases')
+    sub_citeproc_releases = subparsers.add_parser('citeproc-releases',
+        help="convert fatcat release schema to any standard citation format using citeproc/CSL")
     sub_citeproc_releases.set_defaults(func=run_citeproc_releases)
     sub_citeproc_releases.add_argument('json_input',
         help="JSON-per-line of release entities",
@@ -116,7 +118,7 @@ def main():
         print("tell me what to do!")
         sys.exit(-1)

-    args.api = public_api(args.host_url)
+    args.api = public_api(args.fatcat_api_url)
     args.func(args)

 if __name__ == '__main__':
diff --git a/python/fatcat_util.py b/python/fatcat_util.py
index 08224dce..d6e76697 100755
--- a/python/fatcat_util.py
+++ b/python/fatcat_util.py
@@ -32,31 +32,33 @@ def run_editgroup_submit(args):
     args.api.update_editgroup(args.editgroup_id, eg, submit=True)

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debugging interface")
-    parser.add_argument('--host-url',
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--fatcat-api-url',
         default="http://localhost:9411/v0",
         help="connect to this host/port")
     subparsers = parser.add_subparsers()

-    sub_uuid2fcid = subparsers.add_parser('uuid2fcid')
+    sub_uuid2fcid = subparsers.add_parser('uuid2fcid',
+        help="convert a standard UUID (as string) to fatcat ident format")
     sub_uuid2fcid.set_defaults(func=run_uuid2fcid)
     sub_uuid2fcid.add_argument('uuid',
         help="UUID to transform")

-    sub_fcid2uuid = subparsers.add_parser('fcid2uuid')
+    sub_fcid2uuid = subparsers.add_parser('fcid2uuid',
+        help="convert a fatcat ident string to standard UUID format")
     sub_fcid2uuid.set_defaults(func=run_fcid2uuid)
     sub_fcid2uuid.add_argument('fcid',
         help="FCID to transform (into UUID)")

-    sub_editgroup_accept = subparsers.add_parser('editgroup-accept')
+    sub_editgroup_accept = subparsers.add_parser('editgroup-accept',
+        help="accept an editgroup (by ident)")
     sub_editgroup_accept.set_defaults(func=run_editgroup_accept)
     sub_editgroup_accept.add_argument('editgroup_id',
         help="editgroup to accept")

-    sub_editgroup_submit = subparsers.add_parser('editgroup-submit')
+    sub_editgroup_submit = subparsers.add_parser('editgroup-submit',
+        help="submit an editgroup for review (by ident)")
     sub_editgroup_submit.set_defaults(func=run_editgroup_submit)
     sub_editgroup_submit.add_argument('editgroup_id',
         help="editgroup to submit")
@@ -66,7 +68,7 @@ def main():
         print("tell me what to do!")
         sys.exit(-1)

-    args.api = authenticated_api(args.host_url)
+    args.api = authenticated_api(args.fatcat_api_url)
     args.func(args)

 if __name__ == '__main__':
diff --git a/python/fatcat_webface.py b/python/fatcat_webface.py
index c8a39bb9..94dce9ca 100755
--- a/python/fatcat_webface.py
+++ b/python/fatcat_webface.py
@@ -4,7 +4,8 @@ import argparse
 from fatcat_web import app

 def main():
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--debug',
         action='store_true',
         help="enable debugging interface (note: not for everything)")
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index c2120bae..bfb87a72 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -48,10 +48,8 @@ def run_elasticsearch_container(args):
     worker.run()

 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debug logging")
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--api-host-url',
         default="http://localhost:9411/v0",
         help="fatcat API host/port to use")
@@ -63,16 +61,19 @@ def main():
         help="Kafka topic namespace to use (eg, prod, qa, dev)")
     subparsers = parser.add_subparsers()

-    sub_changelog = subparsers.add_parser('changelog')
+    sub_changelog = subparsers.add_parser('changelog',
+        help="poll fatcat API for changelog entries, push to kafka")
     sub_changelog.set_defaults(func=run_changelog)
     sub_changelog.add_argument('--poll-interval',
         help="how long to wait between polling (seconds)",
         default=5.0, type=float)

-    sub_entity_updates = subparsers.add_parser('entity-updates')
+    sub_entity_updates = subparsers.add_parser('entity-updates',
+        help="poll kafka for changelog entries; push entity changes to various kafka topics")
     sub_entity_updates.set_defaults(func=run_entity_updates)

-    sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release')
+    sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release',
+        help="consume kafka feed of new/updated releases, transform and push to search")
     sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release)
     sub_elasticsearch_release.add_argument('--elasticsearch-backend',
         help="elasticsearch backend to connect to",
@@ -81,7 +82,8 @@ def main():
         help="elasticsearch index to push into",
         default="fatcat_release_v03")

-    sub_elasticsearch_container = subparsers.add_parser('elasticsearch-container')
+    sub_elasticsearch_container = subparsers.add_parser('elasticsearch-container',
+        help="consume kafka feed of new/updated containers, transform and push to search")
     sub_elasticsearch_container.set_defaults(func=run_elasticsearch_container)
     sub_elasticsearch_container.add_argument('--elasticsearch-backend',
         help="elasticsearch backend to connect to",
diff --git a/python/shell.py b/python/shell.py
index 436ea7b1..c207a325 100644
--- a/python/shell.py
+++ b/python/shell.py
@@ -11,12 +11,6 @@
 from fatcat_openapi_client.rest import ApiException
 from fatcat_tools import *

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--debug',
-        action='store_true',
-        help="enable debugging interface")
-
-    args = parser.parse_args()
     #api =
     print(" __ _ _ _ ")
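
All of the Python changes above follow one argparse pattern: build the parser with ArgumentDefaultsHelpFormatter so default values show up in --help output, prefer a --fatcat-api-url flag for the API endpoint, and give every subcommand a one-line help= string. The sketch below illustrates that pattern in isolation; it is not part of the commit, and the 'example' subcommand and run_example() handler are hypothetical.

#!/usr/bin/env python3
"""Minimal sketch of the argparse conventions adopted in this commit.

Not the real fatcat entry point: the 'example' subcommand and
run_example() handler are hypothetical, used only to show the pattern.
"""

import argparse
import sys

def run_example(args):
    # a real subcommand handler would call into fatcat_tools here
    print("would connect to API at:", args.fatcat_api_url)

def main():
    parser = argparse.ArgumentParser(
        # show default values alongside each option in --help output
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fatcat-api-url',
        default="http://localhost:9411/v0",
        help="connect to this host/port")
    subparsers = parser.add_subparsers()

    # every subcommand gets a help= string so it is described in --help
    sub_example = subparsers.add_parser('example',
        help="hypothetical subcommand demonstrating the pattern")
    sub_example.set_defaults(func=run_example)

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)
    args.func(args)

if __name__ == '__main__':
    main()

Note that in this commit fatcat_import.py and fatcat_worker.py keep their existing --host-url / --api-host-url flags; only the other scripts switch to --fatcat-api-url.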