From 87099999ebf58b31e2fecd1e3b57bf6712f08b76 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 26 Jul 2018 01:04:48 -0700 Subject: rename python scripts --- python/README.md | 2 +- python/README_import.md | 20 +++++------ python/client.py | 94 ------------------------------------------------ python/fatcat_import.py | 94 ++++++++++++++++++++++++++++++++++++++++++++++++ python/fatcat_webface.py | 37 +++++++++++++++++++ python/run.py | 37 ------------------- 6 files changed, 142 insertions(+), 142 deletions(-) delete mode 100755 python/client.py create mode 100755 python/fatcat_import.py create mode 100755 python/fatcat_webface.py delete mode 100755 python/run.py diff --git a/python/README.md b/python/README.md index c7e33f0a..eebbbd9c 100644 --- a/python/README.md +++ b/python/README.md @@ -3,7 +3,7 @@ Use `pipenv` (which you can install with `pip`). - pipenv run run.py + pipenv run fatcat_webface.py Run tests: diff --git a/python/README_import.md b/python/README_import.md index ae9764e6..38c8406f 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -24,7 +24,7 @@ the others: From CSV file: export LC_ALL=C.UTF-8 - time ./client.py import-issn /srv/datasets/journal_extra_metadata.csv + time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv real 2m42.148s user 0m11.148s @@ -36,38 +36,38 @@ Pretty quick, a few minutes. Directly from compressed tarball; takes about 2 hours in production: - tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./client.py import-orcid - + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid - After tuning database, `jq` CPU seems to be bottleneck, so, from pre-extracted tarball: tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./client.py import-orcid - + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - Does not work: - ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json + ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json Instead: - cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./client.py import-orcid - + cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - Or for many files: - find /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3 -iname '*.json' | parallel --bar jq -c . {} | rg '"person":' | ./client.py import-orcid - + find /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3 -iname '*.json' | parallel --bar jq -c . {} | rg '"person":' | ./fatcat_import.py import-orcid - ### ORCID Performance for ~9k files: - (python-B2RYrks8) bnewbold@orithena$ time parallel --pipepart -j4 -a /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json ./client.py import-orcid - + (python-B2RYrks8) bnewbold@orithena$ time parallel --pipepart -j4 -a /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json ./fatcat_import.py import-orcid - real 0m15.294s user 0m28.112s sys 0m2.408s => 636/second - (python-B2RYrks8) bnewbold@orithena$ time ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json + (python-B2RYrks8) bnewbold@orithena$ time ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json real 0m47.268s user 0m2.616s sys 0m0.104s @@ -94,11 +94,11 @@ After some simple database tuning: From compressed: - xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./client.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt ## Manifest - time ./client.py import-manifest /srv/datasets/idents_files_urls.sqlite + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite [...] Finished a batch; row 284518671 of 9669646 (2942.39%). Total inserted: 6606900 diff --git a/python/client.py b/python/client.py deleted file mode 100755 index 2804a210..00000000 --- a/python/client.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import argparse -from fatcat.raw_api_client import RawFatcatApiClient -from fatcat.crossref_importer import FatcatCrossrefImporter -from fatcat.orcid_importer import FatcatOrcidImporter -from fatcat.manifest_importer import FatcatManifestImporter -from fatcat.issn_importer import FatcatIssnImporter - -def run_import_crossref(args): - fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file, - create_containers=(not args.no_create_containers)) - fci.process_batch(args.json_file, size=args.batch_size) - -def run_import_orcid(args): - foi = FatcatOrcidImporter(args.host_url) - foi.process_batch(args.json_file, size=args.batch_size) - -def run_import_issn(args): - fii = FatcatIssnImporter(args.host_url) - fii.process_csv_batch(args.csv_file, size=args.batch_size) - -def run_import_manifest(args): - fmi = FatcatManifestImporter(args.host_url) - fmi.process_db(args.db_path, size=args.batch_size) - -def health(args): - rfac = RawFatcatApiClient(args.host_url) - print(rfac.health()) - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--debug', - action='store_true', - help="enable debugging interface") - parser.add_argument('--host-url', - default="http://localhost:9411/v0", - help="connect to this host/port") - subparsers = parser.add_subparsers() - - sub_import_crossref = subparsers.add_parser('import-crossref') - sub_import_crossref.set_defaults(func=run_import_crossref) - sub_import_crossref.add_argument('json_file', - help="crossref JSON file to import from", - default=sys.stdin, type=argparse.FileType('r')) - sub_import_crossref.add_argument('issn_map_file', - help="ISSN to ISSN-L mapping file", - default=sys.stdin, type=argparse.FileType('r')) - sub_import_crossref.add_argument('--no-create-containers', - action='store_true', - help="skip creation of new container entities based on ISSN") - sub_import_crossref.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - - sub_import_orcid = subparsers.add_parser('import-orcid') - sub_import_orcid.set_defaults(func=run_import_orcid) - sub_import_orcid.add_argument('json_file', - help="orcid JSON file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_import_orcid.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - - sub_import_issn = subparsers.add_parser('import-issn') - sub_import_issn.set_defaults(func=run_import_issn) - sub_import_issn.add_argument('csv_file', - help="Journal ISSN CSV metadata file to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - sub_import_issn.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - - sub_import_manifest = subparsers.add_parser('import-manifest') - sub_import_manifest.set_defaults(func=run_import_manifest) - sub_import_manifest.add_argument('db_path', - help="sqlite3 database to import from", - type=str) - sub_import_manifest.add_argument('--batch-size', - help="size of batch to send", - default=50, type=int) - - sub_health = subparsers.add_parser('health') - sub_health.set_defaults(func=health) - - args = parser.parse_args() - if not args.__dict__.get("func"): - print("tell me what to do!") - sys.exit(-1) - args.func(args) - -if __name__ == '__main__': - main() diff --git a/python/fatcat_import.py b/python/fatcat_import.py new file mode 100755 index 00000000..2804a210 --- /dev/null +++ b/python/fatcat_import.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +import sys +import argparse +from fatcat.raw_api_client import RawFatcatApiClient +from fatcat.crossref_importer import FatcatCrossrefImporter +from fatcat.orcid_importer import FatcatOrcidImporter +from fatcat.manifest_importer import FatcatManifestImporter +from fatcat.issn_importer import FatcatIssnImporter + +def run_import_crossref(args): + fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file, + create_containers=(not args.no_create_containers)) + fci.process_batch(args.json_file, size=args.batch_size) + +def run_import_orcid(args): + foi = FatcatOrcidImporter(args.host_url) + foi.process_batch(args.json_file, size=args.batch_size) + +def run_import_issn(args): + fii = FatcatIssnImporter(args.host_url) + fii.process_csv_batch(args.csv_file, size=args.batch_size) + +def run_import_manifest(args): + fmi = FatcatManifestImporter(args.host_url) + fmi.process_db(args.db_path, size=args.batch_size) + +def health(args): + rfac = RawFatcatApiClient(args.host_url) + print(rfac.health()) + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--debug', + action='store_true', + help="enable debugging interface") + parser.add_argument('--host-url', + default="http://localhost:9411/v0", + help="connect to this host/port") + subparsers = parser.add_subparsers() + + sub_import_crossref = subparsers.add_parser('import-crossref') + sub_import_crossref.set_defaults(func=run_import_crossref) + sub_import_crossref.add_argument('json_file', + help="crossref JSON file to import from", + default=sys.stdin, type=argparse.FileType('r')) + sub_import_crossref.add_argument('issn_map_file', + help="ISSN to ISSN-L mapping file", + default=sys.stdin, type=argparse.FileType('r')) + sub_import_crossref.add_argument('--no-create-containers', + action='store_true', + help="skip creation of new container entities based on ISSN") + sub_import_crossref.add_argument('--batch-size', + help="size of batch to send", + default=50, type=int) + + sub_import_orcid = subparsers.add_parser('import-orcid') + sub_import_orcid.set_defaults(func=run_import_orcid) + sub_import_orcid.add_argument('json_file', + help="orcid JSON file to import from (or stdin)", + default=sys.stdin, type=argparse.FileType('r')) + sub_import_orcid.add_argument('--batch-size', + help="size of batch to send", + default=50, type=int) + + sub_import_issn = subparsers.add_parser('import-issn') + sub_import_issn.set_defaults(func=run_import_issn) + sub_import_issn.add_argument('csv_file', + help="Journal ISSN CSV metadata file to import from (or stdin)", + default=sys.stdin, type=argparse.FileType('r')) + sub_import_issn.add_argument('--batch-size', + help="size of batch to send", + default=50, type=int) + + sub_import_manifest = subparsers.add_parser('import-manifest') + sub_import_manifest.set_defaults(func=run_import_manifest) + sub_import_manifest.add_argument('db_path', + help="sqlite3 database to import from", + type=str) + sub_import_manifest.add_argument('--batch-size', + help="size of batch to send", + default=50, type=int) + + sub_health = subparsers.add_parser('health') + sub_health.set_defaults(func=health) + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do!") + sys.exit(-1) + args.func(args) + +if __name__ == '__main__': + main() diff --git a/python/fatcat_webface.py b/python/fatcat_webface.py new file mode 100755 index 00000000..cfddad48 --- /dev/null +++ b/python/fatcat_webface.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +import argparse +from fatcat import app + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--debug', + action='store_true', + help="enable debugging interface (note: not for everything)") + parser.add_argument('--host', + default="127.0.0.1", + help="listen on this host/IP") + parser.add_argument('--port', + type=int, + default=9810, + help="listen on this port") + parser.add_argument('--database-uri', + default=app.config['SQLALCHEMY_DATABASE_URI'], + help="sqlalchemy database string") + parser.add_argument('--init-db', + action='store_true', + help="create database tables and insert dummy data") + args = parser.parse_args() + + app.config['SQLALCHEMY_DATABASE_URI'] = args.database_uri + + if args.init_db: + db.create_all() + fatcat.sql.populate_db() + print("Dummy database configured: " + app.config['SQLALCHEMY_DATABASE_URI']) + return + + app.run(debug=args.debug, host=args.host, port=args.port) + +if __name__ == '__main__': + main() diff --git a/python/run.py b/python/run.py deleted file mode 100755 index cfddad48..00000000 --- a/python/run.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -from fatcat import app - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--debug', - action='store_true', - help="enable debugging interface (note: not for everything)") - parser.add_argument('--host', - default="127.0.0.1", - help="listen on this host/IP") - parser.add_argument('--port', - type=int, - default=9810, - help="listen on this port") - parser.add_argument('--database-uri', - default=app.config['SQLALCHEMY_DATABASE_URI'], - help="sqlalchemy database string") - parser.add_argument('--init-db', - action='store_true', - help="create database tables and insert dummy data") - args = parser.parse_args() - - app.config['SQLALCHEMY_DATABASE_URI'] = args.database_uri - - if args.init_db: - db.create_all() - fatcat.sql.populate_db() - print("Dummy database configured: " + app.config['SQLALCHEMY_DATABASE_URI']) - return - - app.run(debug=args.debug, host=args.host, port=args.port) - -if __name__ == '__main__': - main() -- cgit v1.2.3