diff options
Diffstat (limited to 'python')
32 files changed, 196 insertions, 107 deletions
| diff --git a/python/Pipfile b/python/Pipfile index be62c111..f4137dca 100644 --- a/python/Pipfile +++ b/python/Pipfile @@ -1,3 +1,8 @@ + +# This file is *not* used as part of bundling or distributing the python client +# library (fatcat_client). It *is* shared by the web interface (flask app), +# workers, and import scripts. +  [[source]]  url = "https://pypi.python.org/simple"  verify_ssl = true @@ -28,4 +33,8 @@ pykafka = "*"  python-dateutil = "*"  [requires] +# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04 +# (Xenial), currently the default on Internet Archive production VMs, as well +# as Debian stable (stretch). Will probably bump to 3.6 in early 2019 when +# updating both of these environments.  python_version = "3.5" diff --git a/python/README.md b/python/README.md index 9b244e2d..0cfbab21 100644 --- a/python/README.md +++ b/python/README.md @@ -23,10 +23,10 @@ locally):  ## Web Interface  This project uses `pipenv` to manage dependencies, and assumes Python 3.5 -(which pipenv may install if you are running a different local version). You -can can install `pipenv` with `pip`. You may want to set the -`PIPENV_VENV_IN_PROJECT` environment variable on your development machine (see -pipenv docs for details). +(which pipenv may install if you are running a different local version; see +notes in Pipfile). You can can install `pipenv` with `pip`. You may want to set +the `PIPENV_VENV_IN_PROJECT` environment variable on your development machine +(see pipenv docs for details).  To just run the web interface (which will try to connect to a back-end API  server on the same machine by default), use: diff --git a/python/README_import.md b/python/README_import.md index 0264610b..6334dbc6 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -36,7 +36,7 @@ the others:  From CSV file:      # See "start off with" command above -    time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv +    time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv  Usually a couple minutes at most on fast production machine. @@ -44,23 +44,23 @@ Usually a couple minutes at most on fast production machine.  Usually tens of minutes on fast production machine. -    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - +    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid -  ## Crossref  Usually 24 hours or so on fast production machine. -    time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 +    time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3  ## Matched  Unknown speed!      # No file update for the first import... -    zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - +    zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-update -      # ... but do on the second -    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - +    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -      # GROBID extracted (release+file) -    time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-grobid-metadata - +    time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata - diff --git a/python/codegen_python_client.sh b/python/codegen_python_client.sh index 4c146f3f..85d12dc4 100755 --- a/python/codegen_python_client.sh +++ b/python/codegen_python_client.sh @@ -1,5 +1,8 @@  #!/bin/bash +# This script re-generates the fatcat API client (fatcat_client) from the +# swagger/openapi2 spec file, using automated tools ("codegen") +  set -exu  set -o pipefail diff --git a/python/conftest.py b/python/conftest.py index aad02e76..c2a31562 100644 --- a/python/conftest.py +++ b/python/conftest.py @@ -1,3 +1,8 @@ +""" +This file exists soley to prevent pytest from trying to import/test the +fatcat_client ./setup.py file. +""" +  import sys  collect_ignore = ["setup.py"] diff --git a/python/fatcat_export.py b/python/fatcat_export.py index eadf69ab..60c1caf1 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -1,11 +1,18 @@  #!/usr/bin/env python3 +""" +Note: this is *not* the tool used to generate "official" metadata dumps; that +tool is written in rust and runs on the production infrastructure for speed. +These scripts are just a demonstration of how the API *could* be scraped +without permission by an third party. +""" +  import sys  import json  import argparse  import fatcat_client  from fatcat_client.rest import ApiException -from fatcat_tools.fcid import uuid2fcid +from fatcat_tools import uuid2fcid  def run_export_releases(args):      conf = fatcat_client.Configuration() diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 0ec0cfa8..a5527b8c 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -2,36 +2,34 @@  import sys  import argparse -from fatcat_tools.importers.crossref import FatcatCrossrefImporter -from fatcat_tools.importers.orcid import FatcatOrcidImporter -from fatcat_tools.importers.issn import FatcatIssnImporter -from fatcat_tools.importers.matched import FatcatMatchedImporter -from fatcat_tools.importers.grobid_metadata import FatcatGrobidMetadataImporter +from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ +    IssnImporter, MatchedImporter, GrobidMetadataImporter  -def run_import_crossref(args): -    fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file, + +def run_crossref(args): +    fci = CrossrefImporter(args.host_url, args.issn_map_file,          args.extid_map_file, create_containers=(not args.no_create_containers))      fci.process_batch(args.json_file, size=args.batch_size)      fci.describe_run() -def run_import_orcid(args): -    foi = FatcatOrcidImporter(args.host_url) +def run_orcid(args): +    foi = OrcidImporter(args.host_url)      foi.process_batch(args.json_file, size=args.batch_size)      foi.describe_run() -def run_import_issn(args): -    fii = FatcatIssnImporter(args.host_url) +def run_issn(args): +    fii = IssnImporter(args.host_url)      fii.process_csv_batch(args.csv_file, size=args.batch_size)      fii.describe_run() -def run_import_matched(args): -    fmi = FatcatMatchedImporter(args.host_url, +def run_matched(args): +    fmi = MatchedImporter(args.host_url,          skip_file_update=args.no_file_update)      fmi.process_batch(args.json_file, size=args.batch_size)      fmi.describe_run() -def run_import_grobid_metadata(args): -    fmi = FatcatGrobidMetadataImporter(args.host_url) +def run_grobid_metadata(args): +    fmi = GrobidMetadataImporter(args.host_url)      fmi.process_source(args.tsv_file, group_size=args.group_size)      fmi.describe_run() @@ -45,60 +43,60 @@ def main():          help="connect to this host/port")      subparsers = parser.add_subparsers() -    sub_import_crossref = subparsers.add_parser('import-crossref') -    sub_import_crossref.set_defaults(func=run_import_crossref) -    sub_import_crossref.add_argument('json_file', +    sub_crossref = subparsers.add_parser('crossref') +    sub_crossref.set_defaults(func=run_crossref) +    sub_crossref.add_argument('json_file',          help="crossref JSON file to import from",          default=sys.stdin, type=argparse.FileType('r')) -    sub_import_crossref.add_argument('issn_map_file', +    sub_crossref.add_argument('issn_map_file',          help="ISSN to ISSN-L mapping file",          default=None, type=argparse.FileType('r')) -    sub_import_crossref.add_argument('extid_map_file', +    sub_crossref.add_argument('extid_map_file',          help="DOI-to-other-identifiers sqlite3 database",          default=None, type=str) -    sub_import_crossref.add_argument('--no-create-containers', +    sub_crossref.add_argument('--no-create-containers',          action='store_true',          help="skip creation of new container entities based on ISSN") -    sub_import_crossref.add_argument('--batch-size', +    sub_crossref.add_argument('--batch-size',          help="size of batch to send",          default=50, type=int) -    sub_import_orcid = subparsers.add_parser('import-orcid') -    sub_import_orcid.set_defaults(func=run_import_orcid) -    sub_import_orcid.add_argument('json_file', +    sub_orcid = subparsers.add_parser('orcid') +    sub_orcid.set_defaults(func=run_orcid) +    sub_orcid.add_argument('json_file',          help="orcid JSON file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_import_orcid.add_argument('--batch-size', +    sub_orcid.add_argument('--batch-size',          help="size of batch to send",          default=50, type=int) -    sub_import_issn = subparsers.add_parser('import-issn') -    sub_import_issn.set_defaults(func=run_import_issn) -    sub_import_issn.add_argument('csv_file', +    sub_issn = subparsers.add_parser('issn') +    sub_issn.set_defaults(func=run_issn) +    sub_issn.add_argument('csv_file',          help="Journal ISSN CSV metadata file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_import_issn.add_argument('--batch-size', +    sub_issn.add_argument('--batch-size',          help="size of batch to send",          default=50, type=int) -    sub_import_matched = subparsers.add_parser('import-matched') -    sub_import_matched.set_defaults(func=run_import_matched) -    sub_import_matched.add_argument('json_file', +    sub_matched = subparsers.add_parser('matched') +    sub_matched.set_defaults(func=run_matched) +    sub_matched.add_argument('json_file',          help="JSON file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_import_matched.add_argument('--no-file-update', +    sub_matched.add_argument('--no-file-update',          action='store_true',          help="don't lookup existing files, just insert (only for bootstrap)") -    sub_import_matched.add_argument('--batch-size', +    sub_matched.add_argument('--batch-size',          help="size of batch to send",          default=50, type=int) -    sub_import_grobid_metadata = subparsers.add_parser('import-grobid-metadata') -    sub_import_grobid_metadata.set_defaults(func=run_import_grobid_metadata) -    sub_import_grobid_metadata.add_argument('tsv_file', +    sub_grobid_metadata = subparsers.add_parser('grobid-metadata') +    sub_grobid_metadata.set_defaults(func=run_grobid_metadata) +    sub_grobid_metadata.add_argument('tsv_file',          help="TSV file to import from (or stdin)",          default=sys.stdin, type=argparse.FileType('r')) -    sub_import_grobid_metadata.add_argument('--group-size', +    sub_grobid_metadata.add_argument('--group-size',          help="editgroup group size to use",          default=75, type=int) diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py new file mode 100644 index 00000000..0bb42ab5 --- /dev/null +++ b/python/fatcat_tools/__init__.py @@ -0,0 +1,3 @@ + +from .fcid import fcid2uuid, uuid2fcid +from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py index dd72b242..4194ea63 100644 --- a/python/fatcat_tools/fcid.py +++ b/python/fatcat_tools/fcid.py @@ -3,12 +3,18 @@ import base64  import uuid  def fcid2uuid(s): +    """ +    Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object +    """      s = s.split('_')[-1].upper().encode('utf-8')      assert len(s) == 26      raw = base64.b32decode(s + b"======")      return str(uuid.UUID(bytes=raw)).lower()  def uuid2fcid(s): +    """ +    Converts a uuid.UUID object to a fatcat identifier (base32 encoded string) +    """      raw = uuid.UUID(s).bytes      return base64.b32encode(raw)[:26].lower().decode('utf-8') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py new file mode 100644 index 00000000..0f5fafb6 --- /dev/null +++ b/python/fatcat_tools/importers/__init__.py @@ -0,0 +1,7 @@ + +from .common import FatcatImporter +from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP +from .grobid_metadata import GrobidMetadataImporter +from .issn import IssnImporter +from .matched import MatchedImporter +from .orcid import OrcidImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index d289171d..9cf92b41 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -16,6 +16,9 @@ def grouper(iterable, n, fillvalue=None):      return itertools.zip_longest(*args, fillvalue=fillvalue)  class FatcatImporter: +    """ +    Base class for fatcat importers +    """      def __init__(self, host_url, issn_map_file=None):          conf = fatcat_client.Configuration() diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fe80c2d3..fac8f32b 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -5,9 +5,11 @@ import sqlite3  import datetime  import itertools  import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter +# The docs/guide should be the cannonical home for these mappings; update there +# first  CROSSREF_TYPE_MAP = {      'book': 'book',      'book-chapter': 'chapter', @@ -29,8 +31,14 @@ CROSSREF_TYPE_MAP = {      'standard': 'standard',  } +class CrossrefImporter(FatcatImporter): +    """ +    Importer for Crossref metadata. -class FatcatCrossrefImporter(FatcatImporter): +    Can use a local sqlite3 file for faster "external identifier" lookups + +    See https://github.com/CrossRef/rest-api-doc for JSON schema notes +    """      def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):          super().__init__(host_url, issn_map_file) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index dedc9728..ba8a4e6f 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,12 @@ import json  import base64  import datetime  import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter  MAX_ABSTRACT_BYTES=4096 -class FatcatGrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(FatcatImporter):      def __init__(self, host_url, default_link_rel="web"):          super().__init__(host_url) diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index ba8492c6..0b0efccb 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -3,10 +3,8 @@ import sys  import json  import itertools  import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter -# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): -# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count  def or_none(s):      if s is None: @@ -26,7 +24,15 @@ def truthy(s):      else:          return None -class FatcatIssnImporter(FatcatImporter): +class IssnImporter(FatcatImporter): +    """ +    Imports journal metadata ("containers") by ISSN, currently from a custom +    (data munged) .csv file format + +    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + +        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count +    """      def parse_issn_row(self, row):          """ diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 774019c7..732fccbe 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,7 +4,7 @@ import json  import sqlite3  import itertools  import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter  #row = row.split('\t')  #assert len(row) == 2 @@ -13,8 +13,14 @@ from fatcat_tools.importers.common import FatcatImporter  #print(sha1)  #dois = [d.lower() for d in json.loads(row[1])] -class FatcatMatchedImporter(FatcatImporter): +class MatchedImporter(FatcatImporter):      """ +    Importer for "file to crossref DOI" matches. + +    These matches are currently generated by Internet Archive hadoop jobs +    written in scala (part of the 'sandcrawler' repo/project), but could be +    generated by other parties as well. +      Input format is JSON with keys:      - dois (list)      - sha1 (hex) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 527316dd..9e4767f9 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys  import json  import itertools  import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter  def value_or_none(e):      if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e):              return None      return e -class FatcatOrcidImporter(FatcatImporter): +class OrcidImporter(FatcatImporter):      def parse_orcid_dict(self, obj):          """ diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index e10c6ba5..147acd7c 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -3,6 +3,9 @@ import collections  from fatcat_client import ReleaseEntity, ApiClient  def entity_to_json(entity): +    """ +    Hack to take advantage of the code-generated serialization code +    """      ac = ApiClient()      return ac.sanitize_for_serialization(entity) @@ -15,11 +18,12 @@ def entity_from_json(json_str, entity_type):      thing.data = json_str      return ac.deserialize(thing, entity_type) -def release_elastic_dict(release): +def release_to_elasticsearch(release):      """      Converts from an entity model/schema to elasticsearch oriented schema.      Returns: dict +    Raises exception on error (never returns None)      """      if release.state != 'active': diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py new file mode 100644 index 00000000..e8973bc3 --- /dev/null +++ b/python/fatcat_tools/workers/__init__.py @@ -0,0 +1,4 @@ + +from .changelog import ChangelogWorker, EntityUpdatesWorker +from .elasticsearch import ElasticsearchReleaseWorker +from .worker_common import most_recent_message, FatcatWorker diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d07b5988..e64c043b 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -1,11 +1,12 @@  import json  import time -from fatcat_tools.workers.worker_common import FatcatWorker, most_recent_message  from pykafka.common import OffsetType +from .worker_common import FatcatWorker, most_recent_message -class FatcatChangelogWorker(FatcatWorker): + +class ChangelogWorker(FatcatWorker):      """      Periodically polls the fatcat API looking for new changelogs. When they are      found, fetch them and push (as JSON) into a Kafka topic. @@ -52,7 +53,7 @@ class FatcatChangelogWorker(FatcatWorker):                  time.sleep(self.poll_interval) -class FatcatEntityUpdatesWorker(FatcatWorker): +class EntityUpdatesWorker(FatcatWorker):      """      Consumes from the changelog topic and publishes expanded entities (fetched      from API) to update topics. diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elasticsearch.py index 3a75a1b3..e7abd5ee 100644 --- a/python/fatcat_tools/workers/elastic.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -2,14 +2,14 @@  import json  import time  import requests -from fatcat_tools.transforms import release_elastic_dict -from fatcat_tools.workers.worker_common import FatcatWorker -from fatcat_client import ReleaseEntity -from fatcat_tools.transforms import *  from pykafka.common import OffsetType +from fatcat_client import ReleaseEntity +from fatcat_tools import * +from .worker_common import FatcatWorker + -class FatcatElasticReleaseWorker(FatcatWorker): +class ElasticsearchReleaseWorker(FatcatWorker):      """      Consumes from release-updates topic and pushes into (presumably local)      elasticsearch. @@ -18,13 +18,13 @@ class FatcatElasticReleaseWorker(FatcatWorker):      """      def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, -            elastic_backend="http://localhost:9200", elastic_index="fatcat"): +            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):          super().__init__(kafka_hosts=kafka_hosts,                           consume_topic=consume_topic,                           api_host_url=None) -        self.consumer_group = "elastic-updates" -        self.elastic_backend = elastic_backend -        self.elastic_index = elastic_index +        self.consumer_group = "elasticsearch-updates" +        self.elasticsearch_backend = elasticsearch_backend +        self.elasticsearch_index = elasticsearch_index      def run(self):          consume_topic = self.kafka.topics[self.consume_topic] @@ -42,11 +42,11 @@ class FatcatElasticReleaseWorker(FatcatWorker):              json_str = msg.value.decode('utf-8')              release = entity_from_json(json_str, ReleaseEntity)              #print(release) -            elastic_endpoint = "{}/{}/release/{}".format( -                self.elastic_backend, -                self.elastic_index, +            elasticsearch_endpoint = "{}/{}/release/{}".format( +                self.elasticsearch_backend, +                self.elasticsearch_index,                  release.ident) -            print("Updating document: {}".format(elastic_endpoint)) -            resp = requests.post(elastic_endpoint, json=release_elastic_dict(release)) +            print("Updating document: {}".format(elasticsearch_endpoint)) +            resp = requests.post(elasticsearch_endpoint, json=release_to_elasticsearch(release))              assert resp.status_code in (200, 201)              #consumer.commit_offsets() diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 7fcdabea..0a3ea40e 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -3,6 +3,10 @@ import requests  from flask import abort  from fatcat_web import app +""" +Helpers for doing elasticsearch queries (used in the web interface; not part of +the formal API) +"""  def do_search(q, limit=50, fulltext_only=True): diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py index 4c52d2c1..2f883fe0 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -3,27 +3,27 @@  import sys  import argparse  import datetime -from fatcat_tools.workers.changelog import FatcatChangelogWorker, FatcatEntityUpdatesWorker -from fatcat_tools.workers.elastic import FatcatElasticReleaseWorker +from fatcat_tools.workers import ChangelogWorker, EntityUpdatesWorker, ElasticsearchReleaseWorker +  def run_changelog(args):      topic = "fatcat-{}.changelog".format(args.env) -    worker = FatcatChangelogWorker(args.api_host_url, args.kafka_hosts, topic, +    worker = ChangelogWorker(args.api_host_url, args.kafka_hosts, topic,          args.poll_interval)      worker.run()  def run_entity_updates(args):      changelog_topic = "fatcat-{}.changelog".format(args.env)      release_topic = "fatcat-{}.release-updates".format(args.env) -    worker = FatcatEntityUpdatesWorker(args.api_host_url, args.kafka_hosts, +    worker = EntityUpdatesWorker(args.api_host_url, args.kafka_hosts,          changelog_topic, release_topic)      worker.run() -def run_elastic_release(args): +def run_elasticsearch_release(args):      consume_topic = "fatcat-{}.release-updates".format(args.env) -    worker = FatcatElasticReleaseWorker(args.kafka_hosts, -        consume_topic, elastic_backend=args.elastic_backend, -        elastic_index=args.elastic_index) +    worker = ReleaseWorker(args.kafka_hosts, +        consume_topic, elasticsearch_backend=args.elasticsearch_backend, +        elasticsearch_index=args.elasticsearch_index)      worker.run()  def main(): @@ -51,12 +51,12 @@ def main():      sub_entity_updates = subparsers.add_parser('entity-updates')      sub_entity_updates.set_defaults(func=run_entity_updates) -    sub_elastic_release = subparsers.add_parser('elastic-release') -    sub_elastic_release.set_defaults(func=run_elastic_release) -    sub_elastic_release.add_argument('--elastic-backend', +    sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release') +    sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release) +    sub_elasticsearch_release.add_argument('--elasticsearch-backend',          help="elasticsearch backend to connect to",          default="http://localhost:9200") -    sub_elastic_release.add_argument('--elastic-index', +    sub_elasticsearch_release.add_argument('--elasticsearch-index',          help="elasticsearch index to push into",          default="fatcat") diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index 078db184..c129e729 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -1,13 +1,13 @@  import json  import pytest -from fatcat_tools.importers.crossref import FatcatCrossrefImporter +from fatcat_tools.importers import CrossrefImporter  @pytest.fixture(scope="function")  def crossref_importer():      with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: -        yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3') +        yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')  def test_crossref_importer_batch(crossref_importer):      with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py index 8b268e21..ee7040c9 100644 --- a/python/tests/import_grobid_metadata.py +++ b/python/tests/import_grobid_metadata.py @@ -3,7 +3,7 @@ import os  import json  import base64  import pytest -from fatcat_tools.importers.grobid_metadata import FatcatGrobidMetadataImporter +from fatcat_tools.importers import GrobidMetadataImporter  """  WARNING: these tests are currently very fragile because they have database @@ -12,7 +12,7 @@ side-effects. Should probably be disabled or re-written.  @pytest.fixture(scope="function")  def grobid_metadata_importer(): -    yield FatcatGrobidMetadataImporter("http://localhost:9411/v0") +    yield GrobidMetadataImporter("http://localhost:9411/v0")  # TODO: use API to check that entities actually created...  #def test_grobid_metadata_importer_batch(grobid_metadata_importer): diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py index f45747ed..98a9f4a7 100644 --- a/python/tests/import_issn.py +++ b/python/tests/import_issn.py @@ -1,11 +1,11 @@  import pytest -from fatcat_tools.importers.issn import FatcatIssnImporter +from fatcat_tools.importers import IssnImporter  @pytest.fixture(scope="function")  def issn_importer(): -    yield FatcatIssnImporter("http://localhost:9411/v0") +    yield IssnImporter("http://localhost:9411/v0")  # TODO: use API to check that entities actually created...  def test_issn_importer_batch(issn_importer): diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py index 8004e3bd..85e21267 100644 --- a/python/tests/import_matched.py +++ b/python/tests/import_matched.py @@ -1,12 +1,12 @@  import json  import pytest -from fatcat_tools.importers.matched import FatcatMatchedImporter +from fatcat_tools.importers import MatchedImporter  @pytest.fixture(scope="function")  def matched_importer(): -    yield FatcatMatchedImporter("http://localhost:9411/v0") +    yield MatchedImporter("http://localhost:9411/v0")  # TODO: use API to check that entities actually created...  def test_matched_importer_batch(matched_importer): diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py index 2dc98d76..18199888 100644 --- a/python/tests/import_orcid.py +++ b/python/tests/import_orcid.py @@ -1,12 +1,12 @@  import json  import pytest -from fatcat_tools.importers.orcid import FatcatOrcidImporter +from fatcat_tools.importers import OrcidImporter  @pytest.fixture(scope="function")  def orcid_importer(): -    yield FatcatOrcidImporter("http://localhost:9411/v0") +    yield OrcidImporter("http://localhost:9411/v0")  # TODO: use API to check that entities actually created...  def test_orcid_importer_batch(orcid_importer): diff --git a/python/tests/importer.py b/python/tests/importer.py index d98638e4..f228a9b2 100644 --- a/python/tests/importer.py +++ b/python/tests/importer.py @@ -1,7 +1,7 @@  import pytest -from fatcat_tools.importers.common import FatcatImporter +from fatcat_tools.importers import FatcatImporter  def test_issnl_mapping_lookup(): diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py index 52a9965a..18522eff 100644 --- a/python/tests/transform_tests.py +++ b/python/tests/transform_tests.py @@ -1,15 +1,14 @@  import json  import pytest -from fatcat_tools.importers.crossref import FatcatCrossrefImporter -from fatcat_tools.transforms import * +from fatcat_tools import *  from import_crossref import crossref_importer -def test_elastic_convert(crossref_importer): +def test_elasticsearch_convert(crossref_importer):      with open('tests/files/crossref-works.single.json', 'r') as f:          # not a single line          raw = json.loads(f.read())          (r, c) = crossref_importer.parse_crossref_dict(raw)      r.state = 'active' -    release_elastic_dict(r) +    release_to_elasticsearch(r) diff --git a/python/tox.ini b/python/tox.ini index d4fc1d9d..f9d811cd 100644 --- a/python/tox.ini +++ b/python/tox.ini @@ -1,3 +1,6 @@ +# tox isn't actually used for anything right now, but might be a better tool +# for CI or for testing compatibility with multiple python versions +  [tox]  envlist = py35 diff --git a/python/uwsgi.ini b/python/uwsgi.ini index fdf021b6..a03f72bb 100644 --- a/python/uwsgi.ini +++ b/python/uwsgi.ini @@ -1,3 +1,6 @@ +# uwsgi is used in production deployment, in combination with pipenv (which +# generates the .venv file) +  [uwsgi]  plugin = python3,http  http = :9810 diff --git a/python/web_config.py b/python/web_config.py index ed6d830a..749d0586 100644 --- a/python/web_config.py +++ b/python/web_config.py @@ -1,4 +1,14 @@ +""" +Default configuration for fatcat web interface (Flask application). + +In production, we currently reconfigure these values using environment +variables, not by (eg) deploying a variant copy of this file. + +This config is *only* for the web interface, *not* for any of the workers or +import scripts. +""" +  import os  import subprocess | 
