summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- extra/docker/docker-compose.yml 13
-rwxr-xr-x python/fatcat_cleanup.py 10
-rwxr-xr-x python/fatcat_export.py 10
-rwxr-xr-x python/fatcat_harvest.py 18
-rwxr-xr-x python/fatcat_import.py 48
-rwxr-xr-x python/fatcat_review.py 16
-rwxr-xr-x python/fatcat_transform.py 22
-rwxr-xr-x python/fatcat_util.py 22
-rwxr-xr-x python/fatcat_webface.py 3
-rwxr-xr-x python/fatcat_worker.py 18
-rw-r--r-- python/shell.py 6
11 files changed, 108 insertions, 78 deletions
diff --git a/extra/docker/docker-compose.yml b/extra/docker/docker-compose.yml
index a51d43e1..efc16941 100644
--- a/extra/docker/docker-compose.yml
+++ b/extra/docker/docker-compose.yml
@@ -17,6 +17,8 @@ services:
KAFKA_MESSAGE_MAX_BYTES: 50000000
volumes:
- /var/run/docker.sock:/var/run/docker.sock
+ depends_on:
+ - zookeeper
elasticsearch:
build: ../elasticsearch/
ports:
@@ -26,4 +28,15 @@ services:
cluster.name: "docker-cluster"
bootstrap.memory_lock: "true"
discovery.type: "single-node"
+ cluster.routing.allocation.disk.watermark.low: "500mb"
+ cluster.routing.allocation.disk.watermark.high: "500mb"
+ cluster.routing.allocation.disk.watermark.flood_stage: "100mb"
ES_JAVA_OPTS: "-Xms512m -Xmx512m"
+ kafka_pixy:
+ image: mailgun/kafka-pixy:0.17.0
+ entrypoint: /usr/bin/kafka-pixy -kafkaPeers kafka:9092 -zookeeperPeers zookeeper:2181
+ ports:
+ - "19091:19091"
+ - "19092:19092"
+ depends_on:
+ - kafka
diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py
index 42887299..a7c80965 100755
--- a/python/fatcat_cleanup.py
+++ b/python/fatcat_cleanup.py
@@ -14,8 +14,9 @@ def run_files(args):
JsonLinePusher(fmi, args.json_file).run()
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--host-url',
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--fatcat-api-url',
default="http://localhost:9411/v0",
help="connect to this host/port")
parser.add_argument('--batch-size',
@@ -29,7 +30,8 @@ def main():
default=False, type=bool)
subparsers = parser.add_subparsers()
- sub_files = subparsers.add_parser('files')
+ sub_files = subparsers.add_parser('files',
+ help="attempt metadata cleanups over a list of file entities")
sub_files.set_defaults(
func=run_files,
auth_var="FATCAT_AUTH_WORKER_CLEANUP",
@@ -50,7 +52,7 @@ def main():
args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')
args.api = authenticated_api(
- args.host_url,
+ args.fatcat_api_url,
# token is an optional kwarg (can be empty string, None, etc)
token=os.environ.get(args.auth_var))
args.func(args)
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index 1001dbf6..5419e46c 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -37,11 +37,9 @@ def run_export_changelog(args):
json.dumps(entity_to_dict(entry, api_client=args.api.api_client)) + "\n")
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debugging interface")
- parser.add_argument('--host-url',
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--fatcat-api-url',
default="http://localhost:9411/v0",
help="connect to this host/port")
subparsers = parser.add_subparsers()
@@ -72,7 +70,7 @@ def main():
print("tell me what to do!")
sys.exit(-1)
- args.api = public_api(args.host_url)
+ args.api = public_api(args.fatcat_api_url)
args.func(args)
if __name__ == '__main__':
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 1b92a5fd..58bef9ca 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -73,10 +73,8 @@ def mkdate(raw):
return datetime.datetime.strptime(raw, "%Y-%m-%d").date()
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debug logging")
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--kafka-hosts',
default="localhost:9092",
help="list of Kafka brokers (host/port) to use")
@@ -97,16 +95,20 @@ def main():
help="continue harvesting indefinitely in a loop?")
subparsers = parser.add_subparsers()
- sub_crossref = subparsers.add_parser('crossref')
+ sub_crossref = subparsers.add_parser('crossref',
+ help="harvest DOI metadata from Crossref API (JSON)")
sub_crossref.set_defaults(func=run_crossref)
- sub_datacite = subparsers.add_parser('datacite')
+ sub_datacite = subparsers.add_parser('datacite',
+ help="harvest DOI metadata from Datacite API (JSON)")
sub_datacite.set_defaults(func=run_datacite)
- sub_arxiv = subparsers.add_parser('arxiv')
+ sub_arxiv = subparsers.add_parser('arxiv',
+ help="harvest metadata from arxiv.org OAI-PMH endpoint (XML)")
sub_arxiv.set_defaults(func=run_arxiv)
- sub_pubmed = subparsers.add_parser('pubmed')
+ sub_pubmed = subparsers.add_parser('pubmed',
+ help="harvest MEDLINE/PubMed metadata from daily FTP updates (XML)")
sub_pubmed.set_defaults(func=run_pubmed)
# DOAJ stuff disabled because API range-requests are broken
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 656fe87d..04f58ff7 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -156,10 +156,8 @@ def run_cdl_dash_dat(args):
print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debugging interface")
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--host-url',
default="http://localhost:9411/v0",
help="connect to this host/port")
@@ -177,7 +175,8 @@ def main():
default=None, type=str)
subparsers = parser.add_subparsers()
- sub_crossref = subparsers.add_parser('crossref')
+ sub_crossref = subparsers.add_parser('crossref',
+ help="import Crossref API metadata format (JSON)")
sub_crossref.set_defaults(
func=run_crossref,
auth_var="FATCAT_AUTH_WORKER_CROSSREF",
@@ -201,7 +200,8 @@ def main():
action='store_true',
help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
- sub_jalc = subparsers.add_parser('jalc')
+ sub_jalc = subparsers.add_parser('jalc',
+ help="import JALC DOI metadata from XML dump")
sub_jalc.set_defaults(
func=run_jalc,
auth_var="FATCAT_AUTH_WORKER_JALC",
@@ -216,7 +216,8 @@ def main():
help="DOI-to-other-identifiers sqlite3 database",
default=None, type=str)
- sub_arxiv = subparsers.add_parser('arxiv')
+ sub_arxiv = subparsers.add_parser('arxiv',
+ help="import arxiv.org metadata from XML files")
sub_arxiv.set_defaults(
func=run_arxiv,
auth_var="FATCAT_AUTH_WORKER_ARXIV",
@@ -228,7 +229,8 @@ def main():
action='store_true',
help="consume from kafka topic (not stdin)")
- sub_pubmed = subparsers.add_parser('pubmed')
+ sub_pubmed = subparsers.add_parser('pubmed',
+ help="import MEDLINE/PubMed work-level metadata (XML)")
sub_pubmed.set_defaults(
func=run_pubmed,
auth_var="FATCAT_AUTH_WORKER_PUBMED",
@@ -246,7 +248,8 @@ def main():
action='store_true',
help="consume from kafka topic (not stdin)")
- sub_jstor = subparsers.add_parser('jstor')
+ sub_jstor = subparsers.add_parser('jstor',
+ help="import JSTOR work-level metadata from XML dump")
sub_jstor.set_defaults(
func=run_jstor,
auth_var="FATCAT_AUTH_WORKER_JSTOR",
@@ -258,7 +261,8 @@ def main():
help="ISSN to ISSN-L mapping file",
default=None, type=argparse.FileType('r'))
- sub_orcid = subparsers.add_parser('orcid')
+ sub_orcid = subparsers.add_parser('orcid',
+ help="import creator entities from ORCID XML dump")
sub_orcid.set_defaults(
func=run_orcid,
auth_var="FATCAT_AUTH_WORKER_ORCID"
@@ -267,7 +271,8 @@ def main():
help="orcid JSON file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_journal_metadata = subparsers.add_parser('journal-metadata')
+ sub_journal_metadata = subparsers.add_parser('journal-metadata',
+ help="import/update container metadata from old manual munging format")
sub_journal_metadata.set_defaults(
func=run_journal_metadata,
auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
@@ -276,7 +281,8 @@ def main():
help="Journal JSON metadata file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_chocula = subparsers.add_parser('chocula')
+ sub_chocula = subparsers.add_parser('chocula',
+ help="import/update container metadata from chocula JSON export")
sub_chocula.set_defaults(
func=run_chocula,
auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
@@ -285,7 +291,8 @@ def main():
help="chocula JSON entities file (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_matched = subparsers.add_parser('matched')
+ sub_matched = subparsers.add_parser('matched',
+ help="add file entities matched against existing releases; custom JSON format")
sub_matched.set_defaults(
func=run_matched,
auth_var="FATCAT_API_AUTH_TOKEN",
@@ -303,7 +310,8 @@ def main():
default="web",
help="default URL rel for matches (eg, 'publisher', 'web')")
- sub_arabesque_match = subparsers.add_parser('arabesque')
+ sub_arabesque_match = subparsers.add_parser('arabesque',
+ help="add file entities matched to releases from crawl log analysis")
sub_arabesque_match.set_defaults(
func=run_arabesque_match,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
@@ -328,7 +336,8 @@ def main():
default="web",
help="default URL rel for matches (eg, 'publisher', 'web')")
- sub_ingest_file = subparsers.add_parser('ingest-file-results')
+ sub_ingest_file = subparsers.add_parser('ingest-file-results',
+ help="add/update file entities linked to releases based on sandcrawler ingest results")
sub_ingest_file.set_defaults(
func=run_ingest_file,
auth_var="FATCAT_AUTH_WORKER_CRAWL",
@@ -352,7 +361,8 @@ def main():
default="web",
help="default URL rel for matches (eg, 'publisher', 'web')")
- sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
+ sub_grobid_metadata = subparsers.add_parser('grobid-metadata',
+ help="create release and file entities based on GROBID PDF metadata extraction")
sub_grobid_metadata.set_defaults(
func=run_grobid_metadata,
auth_var="FATCAT_API_AUTH_TOKEN",
@@ -370,7 +380,8 @@ def main():
action='store_true',
help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
- sub_wayback_static = subparsers.add_parser('wayback-static')
+ sub_wayback_static = subparsers.add_parser('wayback-static',
+ help="crude crawl+ingest tool for single-page HTML docs from wayback")
sub_wayback_static.set_defaults(
func=run_wayback_static,
auth_var="FATCAT_API_AUTH_TOKEN",
@@ -388,7 +399,8 @@ def main():
type=str,
help="use existing editgroup (instead of creating a new one)")
- sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat')
+ sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat',
+ help="crude helper to import datasets from Dat/CDL mirror pilot project")
sub_cdl_dash_dat.set_defaults(
func=run_cdl_dash_dat,
auth_var="FATCAT_API_AUTH_TOKEN",
diff --git a/python/fatcat_review.py b/python/fatcat_review.py
index 40bc7041..1d1db9a5 100755
--- a/python/fatcat_review.py
+++ b/python/fatcat_review.py
@@ -14,7 +14,7 @@ sentry_client = raven.Client()
def run_dummy(args):
reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval,
- verbose=args.debug)
+ verbose=args.verbose)
if args.editgroup:
annotation = reviewer.run_single(args.editgroup, args.annotate)
print(annotation)
@@ -22,11 +22,12 @@ def run_dummy(args):
reviewer.run()
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--verbose',
action='store_true',
- help="enable debug logging")
- parser.add_argument('--api-host-url',
+ help="enable verbose output")
+ parser.add_argument('--fatcat-api-url',
default="http://localhost:9411/v0",
help="fatcat API host/port to use")
parser.add_argument('--poll-interval',
@@ -34,7 +35,8 @@ def main():
default=10.0, type=float)
subparsers = parser.add_subparsers()
- sub_dummy = subparsers.add_parser('dummy')
+ sub_dummy = subparsers.add_parser('dummy',
+ help="example/demonstration review bot")
sub_dummy.set_defaults(func=run_dummy)
sub_dummy.add_argument("--continuous",
action="store_true",
@@ -53,7 +55,7 @@ def main():
print("need to run on a single editgroup, or continuous")
sys.exit(-1)
- args.api = authenticated_api(args.api_host_url)
+ args.api = authenticated_api(args.fatcat_api_url)
args.func(args)
if __name__ == '__main__':
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
index 3f90337e..ccb13871 100755
--- a/python/fatcat_transform.py
+++ b/python/fatcat_transform.py
@@ -60,16 +60,15 @@ def run_citeproc_releases(args):
args.json_output.write(out + "\n")
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debugging interface")
- parser.add_argument('--host-url',
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--fatcat-api-url',
default="http://localhost:9411/v0",
help="connect to this host/port")
subparsers = parser.add_subparsers()
- sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases')
+ sub_elasticsearch_releases = subparsers.add_parser('elasticsearch-releases',
+ help="convert fatcat release JSON schema to elasticsearch release schema")
sub_elasticsearch_releases.set_defaults(func=run_elasticsearch_releases)
sub_elasticsearch_releases.add_argument('json_input',
help="JSON-per-line of release entities",
@@ -78,7 +77,8 @@ def main():
help="where to send output",
default=sys.stdout, type=argparse.FileType('w'))
- sub_elasticsearch_containers = subparsers.add_parser('elasticsearch-containers')
+ sub_elasticsearch_containers = subparsers.add_parser('elasticsearch-containers',
+ help="convert fatcat container JSON schema to elasticsearch container schema")
sub_elasticsearch_containers.set_defaults(func=run_elasticsearch_containers)
sub_elasticsearch_containers.add_argument('json_input',
help="JSON-per-line of container entities",
@@ -87,7 +87,8 @@ def main():
help="where to send output",
default=sys.stdout, type=argparse.FileType('w'))
- sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs')
+ sub_elasticsearch_changelogs = subparsers.add_parser('elasticsearch-changelogs',
+ help="convert fatcat changelog JSON schema to elasticsearch changelog schema")
sub_elasticsearch_changelogs.set_defaults(func=run_elasticsearch_changelogs)
sub_elasticsearch_changelogs.add_argument('json_input',
help="JSON-per-line of changelog entries",
@@ -96,7 +97,8 @@ def main():
help="where to send output",
default=sys.stdout, type=argparse.FileType('w'))
- sub_citeproc_releases = subparsers.add_parser('citeproc-releases')
+ sub_citeproc_releases = subparsers.add_parser('citeproc-releases',
+ help="convert fatcat release schema to any standard citation format using citeproc/CSL")
sub_citeproc_releases.set_defaults(func=run_citeproc_releases)
sub_citeproc_releases.add_argument('json_input',
help="JSON-per-line of release entities",
@@ -116,7 +118,7 @@ def main():
print("tell me what to do!")
sys.exit(-1)
- args.api = public_api(args.host_url)
+ args.api = public_api(args.fatcat_api_url)
args.func(args)
if __name__ == '__main__':
diff --git a/python/fatcat_util.py b/python/fatcat_util.py
index 08224dce..d6e76697 100755
--- a/python/fatcat_util.py
+++ b/python/fatcat_util.py
@@ -32,31 +32,33 @@ def run_editgroup_submit(args):
args.api.update_editgroup(args.editgroup_id, eg, submit=True)
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debugging interface")
- parser.add_argument('--host-url',
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--fatcat-api-url',
default="http://localhost:9411/v0",
help="connect to this host/port")
subparsers = parser.add_subparsers()
- sub_uuid2fcid = subparsers.add_parser('uuid2fcid')
+ sub_uuid2fcid = subparsers.add_parser('uuid2fcid',
+ help="convert a standard UUID (as string) to fatcat ident format")
sub_uuid2fcid.set_defaults(func=run_uuid2fcid)
sub_uuid2fcid.add_argument('uuid',
help="UUID to transform")
- sub_fcid2uuid = subparsers.add_parser('fcid2uuid')
+ sub_fcid2uuid = subparsers.add_parser('fcid2uuid',
+ help="convert a fatcat ident string to standard UUID format")
sub_fcid2uuid.set_defaults(func=run_fcid2uuid)
sub_fcid2uuid.add_argument('fcid',
help="FCID to transform (into UUID)")
- sub_editgroup_accept = subparsers.add_parser('editgroup-accept')
+ sub_editgroup_accept = subparsers.add_parser('editgroup-accept',
+ help="accept an editgroup (by ident)")
sub_editgroup_accept.set_defaults(func=run_editgroup_accept)
sub_editgroup_accept.add_argument('editgroup_id',
help="editgroup to accept")
- sub_editgroup_submit = subparsers.add_parser('editgroup-submit')
+ sub_editgroup_submit = subparsers.add_parser('editgroup-submit',
+ help="submit an editgroup for review (by ident)")
sub_editgroup_submit.set_defaults(func=run_editgroup_submit)
sub_editgroup_submit.add_argument('editgroup_id',
help="editgroup to submit")
@@ -66,7 +68,7 @@ def main():
print("tell me what to do!")
sys.exit(-1)
- args.api = authenticated_api(args.host_url)
+ args.api = authenticated_api(args.fatcat_api_url)
args.func(args)
if __name__ == '__main__':
diff --git a/python/fatcat_webface.py b/python/fatcat_webface.py
index c8a39bb9..94dce9ca 100755
--- a/python/fatcat_webface.py
+++ b/python/fatcat_webface.py
@@ -4,7 +4,8 @@ import argparse
from fatcat_web import app
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--debug',
action='store_true',
help="enable debugging interface (note: not for everything)")
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index c2120bae..bfb87a72 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -48,10 +48,8 @@ def run_elasticsearch_container(args):
worker.run()
def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debug logging")
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--api-host-url',
default="http://localhost:9411/v0",
help="fatcat API host/port to use")
@@ -63,16 +61,19 @@ def main():
help="Kafka topic namespace to use (eg, prod, qa, dev)")
subparsers = parser.add_subparsers()
- sub_changelog = subparsers.add_parser('changelog')
+ sub_changelog = subparsers.add_parser('changelog',
+ help="poll fatcat API for changelog entries, push to kafka")
sub_changelog.set_defaults(func=run_changelog)
sub_changelog.add_argument('--poll-interval',
help="how long to wait between polling (seconds)",
default=5.0, type=float)
- sub_entity_updates = subparsers.add_parser('entity-updates')
+ sub_entity_updates = subparsers.add_parser('entity-updates',
+ help="poll kafka for changelog entries; push entity changes to various kafka topics")
sub_entity_updates.set_defaults(func=run_entity_updates)
- sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release')
+ sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release',
+ help="consume kafka feed of new/updated releases, transform and push to search")
sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release)
sub_elasticsearch_release.add_argument('--elasticsearch-backend',
help="elasticsearch backend to connect to",
@@ -81,7 +82,8 @@ def main():
help="elasticsearch index to push into",
default="fatcat_release_v03")
- sub_elasticsearch_container = subparsers.add_parser('elasticsearch-container')
+ sub_elasticsearch_container = subparsers.add_parser('elasticsearch-container',
+ help="consume kafka feed of new/updated containers, transform and push to search")
sub_elasticsearch_container.set_defaults(func=run_elasticsearch_container)
sub_elasticsearch_container.add_argument('--elasticsearch-backend',
help="elasticsearch backend to connect to",
diff --git a/python/shell.py b/python/shell.py
index 436ea7b1..c207a325 100644
--- a/python/shell.py
+++ b/python/shell.py
@@ -11,12 +11,6 @@ from fatcat_openapi_client.rest import ApiException
from fatcat_tools import *
if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- parser.add_argument('--debug',
- action='store_true',
- help="enable debugging interface")
-
- args = parser.parse_args()
#api =
print(" __ _ _ _ ")