large refactor of python names/paths

- Add __init__.py files for fatcat_tools submodules, and use them in imports - Add a bunch of comments to files. - rename a number of classes and functions to be less verbose
author: Bryan Newbold <bnewbold@robocracy.org> 2018-11-15 13:11:52 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-11-15 13:15:15 -0800
commit: bb28a3fc1cc900f2dde31e1dbc492d9661034f41 (patch)
tree: f037dd3d1bab6cbf08a562dbdd4c09361fe0c030
parent: 9f817c6c70a749f2ac449ab4edfd26c6dd8a7410 (diff)
download: fatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.tar.gz
fatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.zip
32 files changed, 196 insertions, 107 deletions
diff --git a/python/Pipfile b/python/Pipfile
index be62c111..f4137dca 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,3 +1,8 @@
+
+# This file is *not* used as part of bundling or distributing the python client
+# library (fatcat_client). It *is* shared by the web interface (flask app),
+# workers, and import scripts.
+
 [[source]]
 url = "https://pypi.python.org/simple"
 verify_ssl = true
@@ -28,4 +33,8 @@ pykafka = "*"
 python-dateutil = "*"
 
 [requires]
+# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
+# (Xenial), currently the default on Internet Archive production VMs, as well
+# as Debian stable (stretch). Will probably bump to 3.6 in early 2019 when
+# updating both of these environments.
 python_version = "3.5"
diff --git a/python/README.md b/python/README.md
index 9b244e2d..0cfbab21 100644
--- a/python/README.md
+++ b/python/README.md
@@ -23,10 +23,10 @@ locally):
 ## Web Interface
 
 This project uses `pipenv` to manage dependencies, and assumes Python 3.5
-(which pipenv may install if you are running a different local version). You
-can can install `pipenv` with `pip`. You may want to set the
-`PIPENV_VENV_IN_PROJECT` environment variable on your development machine (see
-pipenv docs for details).
+(which pipenv may install if you are running a different local version; see
+notes in Pipfile). You can can install `pipenv` with `pip`. You may want to set
+the `PIPENV_VENV_IN_PROJECT` environment variable on your development machine
+(see pipenv docs for details).
 
 To just run the web interface (which will try to connect to a back-end API
 server on the same machine by default), use:
diff --git a/python/README_import.md b/python/README_import.md
index 0264610b..6334dbc6 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -36,7 +36,7 @@ the others:
 From CSV file:
 
     # See "start off with" command above
-    time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv
+    time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv
 
 Usually a couple minutes at most on fast production machine.
 
@@ -44,23 +44,23 @@ Usually a couple minutes at most on fast production machine.
 
 Usually tens of minutes on fast production machine.
 
-    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
+    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid -
 
 ## Crossref
 
 Usually 24 hours or so on fast production machine.
 
-    time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+    time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
 
 ## Matched
 
 Unknown speed!
 
     # No file update for the first import...
-    zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update -
+    zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-update -
 
     # ... but do on the second
-    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
 
     # GROBID extracted (release+file)
-    time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-grobid-metadata -
+    time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -
diff --git a/python/codegen_python_client.sh b/python/codegen_python_client.sh
index 4c146f3f..85d12dc4 100755
--- a/python/codegen_python_client.sh
+++ b/python/codegen_python_client.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
 
+# This script re-generates the fatcat API client (fatcat_client) from the
+# swagger/openapi2 spec file, using automated tools ("codegen")
+
 set -exu
 set -o pipefail
 
diff --git a/python/conftest.py b/python/conftest.py
index aad02e76..c2a31562 100644
--- a/python/conftest.py
+++ b/python/conftest.py
@@ -1,3 +1,8 @@
+"""
+This file exists soley to prevent pytest from trying to import/test the
+fatcat_client ./setup.py file.
+"""
+
 import sys
 
 collect_ignore = ["setup.py"]
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index eadf69ab..60c1caf1 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -1,11 +1,18 @@
 #!/usr/bin/env python3
 
+"""
+Note: this is *not* the tool used to generate "official" metadata dumps; that
+tool is written in rust and runs on the production infrastructure for speed.
+These scripts are just a demonstration of how the API *could* be scraped
+without permission by an third party.
+"""
+
 import sys
 import json
 import argparse
 import fatcat_client
 from fatcat_client.rest import ApiException
-from fatcat_tools.fcid import uuid2fcid
+from fatcat_tools import uuid2fcid
 
 def run_export_releases(args):
     conf = fatcat_client.Configuration()
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 0ec0cfa8..a5527b8c 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -2,36 +2,34 @@
 
 import sys
 import argparse
-from fatcat_tools.importers.crossref import FatcatCrossrefImporter
-from fatcat_tools.importers.orcid import FatcatOrcidImporter
-from fatcat_tools.importers.issn import FatcatIssnImporter
-from fatcat_tools.importers.matched import FatcatMatchedImporter
-from fatcat_tools.importers.grobid_metadata import FatcatGrobidMetadataImporter
+from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \
+    IssnImporter, MatchedImporter, GrobidMetadataImporter 
 
-def run_import_crossref(args):
-    fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file,
+
+def run_crossref(args):
+    fci = CrossrefImporter(args.host_url, args.issn_map_file,
         args.extid_map_file, create_containers=(not args.no_create_containers))
     fci.process_batch(args.json_file, size=args.batch_size)
     fci.describe_run()
 
-def run_import_orcid(args):
-    foi = FatcatOrcidImporter(args.host_url)
+def run_orcid(args):
+    foi = OrcidImporter(args.host_url)
     foi.process_batch(args.json_file, size=args.batch_size)
     foi.describe_run()
 
-def run_import_issn(args):
-    fii = FatcatIssnImporter(args.host_url)
+def run_issn(args):
+    fii = IssnImporter(args.host_url)
     fii.process_csv_batch(args.csv_file, size=args.batch_size)
     fii.describe_run()
 
-def run_import_matched(args):
-    fmi = FatcatMatchedImporter(args.host_url,
+def run_matched(args):
+    fmi = MatchedImporter(args.host_url,
         skip_file_update=args.no_file_update)
     fmi.process_batch(args.json_file, size=args.batch_size)
     fmi.describe_run()
 
-def run_import_grobid_metadata(args):
-    fmi = FatcatGrobidMetadataImporter(args.host_url)
+def run_grobid_metadata(args):
+    fmi = GrobidMetadataImporter(args.host_url)
     fmi.process_source(args.tsv_file, group_size=args.group_size)
     fmi.describe_run()
 
@@ -45,60 +43,60 @@ def main():
         help="connect to this host/port")
     subparsers = parser.add_subparsers()
 
-    sub_import_crossref = subparsers.add_parser('import-crossref')
-    sub_import_crossref.set_defaults(func=run_import_crossref)
-    sub_import_crossref.add_argument('json_file',
+    sub_crossref = subparsers.add_parser('crossref')
+    sub_crossref.set_defaults(func=run_crossref)
+    sub_crossref.add_argument('json_file',
         help="crossref JSON file to import from",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_import_crossref.add_argument('issn_map_file',
+    sub_crossref.add_argument('issn_map_file',
         help="ISSN to ISSN-L mapping file",
         default=None, type=argparse.FileType('r'))
-    sub_import_crossref.add_argument('extid_map_file',
+    sub_crossref.add_argument('extid_map_file',
         help="DOI-to-other-identifiers sqlite3 database",
         default=None, type=str)
-    sub_import_crossref.add_argument('--no-create-containers',
+    sub_crossref.add_argument('--no-create-containers',
         action='store_true',
         help="skip creation of new container entities based on ISSN")
-    sub_import_crossref.add_argument('--batch-size',
+    sub_crossref.add_argument('--batch-size',
         help="size of batch to send",
         default=50, type=int)
 
-    sub_import_orcid = subparsers.add_parser('import-orcid')
-    sub_import_orcid.set_defaults(func=run_import_orcid)
-    sub_import_orcid.add_argument('json_file',
+    sub_orcid = subparsers.add_parser('orcid')
+    sub_orcid.set_defaults(func=run_orcid)
+    sub_orcid.add_argument('json_file',
         help="orcid JSON file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_import_orcid.add_argument('--batch-size',
+    sub_orcid.add_argument('--batch-size',
         help="size of batch to send",
         default=50, type=int)
 
-    sub_import_issn = subparsers.add_parser('import-issn')
-    sub_import_issn.set_defaults(func=run_import_issn)
-    sub_import_issn.add_argument('csv_file',
+    sub_issn = subparsers.add_parser('issn')
+    sub_issn.set_defaults(func=run_issn)
+    sub_issn.add_argument('csv_file',
         help="Journal ISSN CSV metadata file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_import_issn.add_argument('--batch-size',
+    sub_issn.add_argument('--batch-size',
         help="size of batch to send",
         default=50, type=int)
 
-    sub_import_matched = subparsers.add_parser('import-matched')
-    sub_import_matched.set_defaults(func=run_import_matched)
-    sub_import_matched.add_argument('json_file',
+    sub_matched = subparsers.add_parser('matched')
+    sub_matched.set_defaults(func=run_matched)
+    sub_matched.add_argument('json_file',
         help="JSON file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_import_matched.add_argument('--no-file-update',
+    sub_matched.add_argument('--no-file-update',
         action='store_true',
         help="don't lookup existing files, just insert (only for bootstrap)")
-    sub_import_matched.add_argument('--batch-size',
+    sub_matched.add_argument('--batch-size',
         help="size of batch to send",
         default=50, type=int)
 
-    sub_import_grobid_metadata = subparsers.add_parser('import-grobid-metadata')
-    sub_import_grobid_metadata.set_defaults(func=run_import_grobid_metadata)
-    sub_import_grobid_metadata.add_argument('tsv_file',
+    sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
+    sub_grobid_metadata.set_defaults(func=run_grobid_metadata)
+    sub_grobid_metadata.add_argument('tsv_file',
         help="TSV file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
-    sub_import_grobid_metadata.add_argument('--group-size',
+    sub_grobid_metadata.add_argument('--group-size',
         help="editgroup group size to use",
         default=75, type=int)
 
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
new file mode 100644
index 00000000..0bb42ab5
--- /dev/null
+++ b/python/fatcat_tools/__init__.py
@@ -0,0 +1,3 @@
+
+from .fcid import fcid2uuid, uuid2fcid
+from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch
diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py
index dd72b242..4194ea63 100644
--- a/python/fatcat_tools/fcid.py
+++ b/python/fatcat_tools/fcid.py
@@ -3,12 +3,18 @@ import base64
 import uuid
 
 def fcid2uuid(s):
+    """
+    Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object
+    """
     s = s.split('_')[-1].upper().encode('utf-8')
     assert len(s) == 26
     raw = base64.b32decode(s + b"======")
     return str(uuid.UUID(bytes=raw)).lower()
 
 def uuid2fcid(s):
+    """
+    Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
+    """
     raw = uuid.UUID(s).bytes
     return base64.b32encode(raw)[:26].lower().decode('utf-8')
 
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
new file mode 100644
index 00000000..0f5fafb6
--- /dev/null
+++ b/python/fatcat_tools/importers/__init__.py
@@ -0,0 +1,7 @@
+
+from .common import FatcatImporter
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .grobid_metadata import GrobidMetadataImporter
+from .issn import IssnImporter
+from .matched import MatchedImporter
+from .orcid import OrcidImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index d289171d..9cf92b41 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -16,6 +16,9 @@ def grouper(iterable, n, fillvalue=None):
     return itertools.zip_longest(*args, fillvalue=fillvalue)
 
 class FatcatImporter:
+    """
+    Base class for fatcat importers
+    """
 
     def __init__(self, host_url, issn_map_file=None):
         conf = fatcat_client.Configuration()
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fe80c2d3..fac8f32b 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -5,9 +5,11 @@ import sqlite3
 import datetime
 import itertools
 import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
 
 
+# The docs/guide should be the cannonical home for these mappings; update there
+# first
 CROSSREF_TYPE_MAP = {
     'book': 'book',
     'book-chapter': 'chapter',
@@ -29,8 +31,14 @@ CROSSREF_TYPE_MAP = {
     'standard': 'standard',
 }
 
+class CrossrefImporter(FatcatImporter):
+    """
+    Importer for Crossref metadata.
 
-class FatcatCrossrefImporter(FatcatImporter):
+    Can use a local sqlite3 file for faster "external identifier" lookups
+
+    See https://github.com/CrossRef/rest-api-doc for JSON schema notes
+    """
 
     def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
         super().__init__(host_url, issn_map_file)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index dedc9728..ba8a4e6f 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,12 @@ import json
 import base64
 import datetime
 import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
 
 MAX_ABSTRACT_BYTES=4096
 
 
-class FatcatGrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(FatcatImporter):
 
     def __init__(self, host_url, default_link_rel="web"):
         super().__init__(host_url)
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index ba8492c6..0b0efccb 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -3,10 +3,8 @@ import sys
 import json
 import itertools
 import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
 
-# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
 
 def or_none(s):
     if s is None:
@@ -26,7 +24,15 @@ def truthy(s):
     else:
         return None
 
-class FatcatIssnImporter(FatcatImporter):
+class IssnImporter(FatcatImporter):
+    """
+    Imports journal metadata ("containers") by ISSN, currently from a custom
+    (data munged) .csv file format
+
+    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+    """
 
     def parse_issn_row(self, row):
         """
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 774019c7..732fccbe 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
 import sqlite3
 import itertools
 import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
 
 #row = row.split('\t')
 #assert len(row) == 2
@@ -13,8 +13,14 @@ from fatcat_tools.importers.common import FatcatImporter
 #print(sha1)
 #dois = [d.lower() for d in json.loads(row[1])]
 
-class FatcatMatchedImporter(FatcatImporter):
+class MatchedImporter(FatcatImporter):
     """
+    Importer for "file to crossref DOI" matches.
+
+    These matches are currently generated by Internet Archive hadoop jobs
+    written in scala (part of the 'sandcrawler' repo/project), but could be
+    generated by other parties as well.
+
     Input format is JSON with keys:
     - dois (list)
     - sha1 (hex)
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 527316dd..9e4767f9 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
 import json
 import itertools
 import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
 
 def value_or_none(e):
     if type(e) == dict:
@@ -20,7 +20,7 @@ def value_or_none(e):
             return None
     return e
 
-class FatcatOrcidImporter(FatcatImporter):
+class OrcidImporter(FatcatImporter):
 
     def parse_orcid_dict(self, obj):
         """
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index e10c6ba5..147acd7c 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -3,6 +3,9 @@ import collections
 from fatcat_client import ReleaseEntity, ApiClient
 
 def entity_to_json(entity):
+    """
+    Hack to take advantage of the code-generated serialization code
+    """
     ac = ApiClient()
     return ac.sanitize_for_serialization(entity)
 
@@ -15,11 +18,12 @@ def entity_from_json(json_str, entity_type):
     thing.data = json_str
     return ac.deserialize(thing, entity_type)
 
-def release_elastic_dict(release):
+def release_to_elasticsearch(release):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
 
     Returns: dict
+    Raises exception on error (never returns None)
     """
 
     if release.state != 'active':
diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py
new file mode 100644
index 00000000..e8973bc3
--- /dev/null
+++ b/python/fatcat_tools/workers/__init__.py
@@ -0,0 +1,4 @@
+
+from .changelog import ChangelogWorker, EntityUpdatesWorker
+from .elasticsearch import ElasticsearchReleaseWorker
+from .worker_common import most_recent_message, FatcatWorker
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d07b5988..e64c043b 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -1,11 +1,12 @@
 
 import json
 import time
-from fatcat_tools.workers.worker_common import FatcatWorker, most_recent_message
 from pykafka.common import OffsetType
 
+from .worker_common import FatcatWorker, most_recent_message
 
-class FatcatChangelogWorker(FatcatWorker):
+
+class ChangelogWorker(FatcatWorker):
     """
     Periodically polls the fatcat API looking for new changelogs. When they are
     found, fetch them and push (as JSON) into a Kafka topic.
@@ -52,7 +53,7 @@ class FatcatChangelogWorker(FatcatWorker):
                 time.sleep(self.poll_interval)
 
 
-class FatcatEntityUpdatesWorker(FatcatWorker):
+class EntityUpdatesWorker(FatcatWorker):
     """
     Consumes from the changelog topic and publishes expanded entities (fetched
     from API) to update topics.
diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elasticsearch.py
index 3a75a1b3..e7abd5ee 100644
--- a/python/fatcat_tools/workers/elastic.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,14 +2,14 @@
 import json
 import time
 import requests
-from fatcat_tools.transforms import release_elastic_dict
-from fatcat_tools.workers.worker_common import FatcatWorker
-from fatcat_client import ReleaseEntity
-from fatcat_tools.transforms import *
 from pykafka.common import OffsetType
 
+from fatcat_client import ReleaseEntity
+from fatcat_tools import *
+from .worker_common import FatcatWorker
+
 
-class FatcatElasticReleaseWorker(FatcatWorker):
+class ElasticsearchReleaseWorker(FatcatWorker):
     """
     Consumes from release-updates topic and pushes into (presumably local)
     elasticsearch.
@@ -18,13 +18,13 @@ class FatcatElasticReleaseWorker(FatcatWorker):
     """
 
     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
-            elastic_backend="http://localhost:9200", elastic_index="fatcat"):
+            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic,
                          api_host_url=None)
-        self.consumer_group = "elastic-updates"
-        self.elastic_backend = elastic_backend
-        self.elastic_index = elastic_index
+        self.consumer_group = "elasticsearch-updates"
+        self.elasticsearch_backend = elasticsearch_backend
+        self.elasticsearch_index = elasticsearch_index
 
     def run(self):
         consume_topic = self.kafka.topics[self.consume_topic]
@@ -42,11 +42,11 @@ class FatcatElasticReleaseWorker(FatcatWorker):
             json_str = msg.value.decode('utf-8')
             release = entity_from_json(json_str, ReleaseEntity)
             #print(release)
-            elastic_endpoint = "{}/{}/release/{}".format(
-                self.elastic_backend,
-                self.elastic_index,
+            elasticsearch_endpoint = "{}/{}/release/{}".format(
+                self.elasticsearch_backend,
+                self.elasticsearch_index,
                 release.ident)
-            print("Updating document: {}".format(elastic_endpoint))
-            resp = requests.post(elastic_endpoint, json=release_elastic_dict(release))
+            print("Updating document: {}".format(elasticsearch_endpoint))
+            resp = requests.post(elasticsearch_endpoint, json=release_to_elasticsearch(release))
             assert resp.status_code in (200, 201)
             #consumer.commit_offsets()
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 7fcdabea..0a3ea40e 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -3,6 +3,10 @@ import requests
 from flask import abort
 from fatcat_web import app
 
+"""
+Helpers for doing elasticsearch queries (used in the web interface; not part of
+the formal API)
+"""
 
 def do_search(q, limit=50, fulltext_only=True):
 
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index 4c52d2c1..2f883fe0 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -3,27 +3,27 @@
 import sys
 import argparse
 import datetime
-from fatcat_tools.workers.changelog import FatcatChangelogWorker, FatcatEntityUpdatesWorker
-from fatcat_tools.workers.elastic import FatcatElasticReleaseWorker
+from fatcat_tools.workers import ChangelogWorker, EntityUpdatesWorker, ElasticsearchReleaseWorker
+
 
 def run_changelog(args):
     topic = "fatcat-{}.changelog".format(args.env)
-    worker = FatcatChangelogWorker(args.api_host_url, args.kafka_hosts, topic,
+    worker = ChangelogWorker(args.api_host_url, args.kafka_hosts, topic,
         args.poll_interval)
     worker.run()
 
 def run_entity_updates(args):
     changelog_topic = "fatcat-{}.changelog".format(args.env)
     release_topic = "fatcat-{}.release-updates".format(args.env)
-    worker = FatcatEntityUpdatesWorker(args.api_host_url, args.kafka_hosts,
+    worker = EntityUpdatesWorker(args.api_host_url, args.kafka_hosts,
         changelog_topic, release_topic)
     worker.run()
 
-def run_elastic_release(args):
+def run_elasticsearch_release(args):
     consume_topic = "fatcat-{}.release-updates".format(args.env)
-    worker = FatcatElasticReleaseWorker(args.kafka_hosts,
-        consume_topic, elastic_backend=args.elastic_backend,
-        elastic_index=args.elastic_index)
+    worker = ReleaseWorker(args.kafka_hosts,
+        consume_topic, elasticsearch_backend=args.elasticsearch_backend,
+        elasticsearch_index=args.elasticsearch_index)
     worker.run()
 
 def main():
@@ -51,12 +51,12 @@ def main():
     sub_entity_updates = subparsers.add_parser('entity-updates')
     sub_entity_updates.set_defaults(func=run_entity_updates)
 
-    sub_elastic_release = subparsers.add_parser('elastic-release')
-    sub_elastic_release.set_defaults(func=run_elastic_release)
-    sub_elastic_release.add_argument('--elastic-backend',
+    sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release')
+    sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release)
+    sub_elasticsearch_release.add_argument('--elasticsearch-backend',
         help="elasticsearch backend to connect to",
         default="http://localhost:9200")
-    sub_elastic_release.add_argument('--elastic-index',
+    sub_elasticsearch_release.add_argument('--elasticsearch-index',
         help="elasticsearch index to push into",
         default="fatcat")
 
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 078db184..c129e729 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,13 +1,13 @@
 
 import json
 import pytest
-from fatcat_tools.importers.crossref import FatcatCrossrefImporter
+from fatcat_tools.importers import CrossrefImporter
 
 
 @pytest.fixture(scope="function")
 def crossref_importer():
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
-        yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
+        yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
 
 def test_crossref_importer_batch(crossref_importer):
     with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py
index 8b268e21..ee7040c9 100644
--- a/python/tests/import_grobid_metadata.py
+++ b/python/tests/import_grobid_metadata.py
@@ -3,7 +3,7 @@ import os
 import json
 import base64
 import pytest
-from fatcat_tools.importers.grobid_metadata import FatcatGrobidMetadataImporter
+from fatcat_tools.importers import GrobidMetadataImporter
 
 """
 WARNING: these tests are currently very fragile because they have database
@@ -12,7 +12,7 @@ side-effects. Should probably be disabled or re-written.
 
 @pytest.fixture(scope="function")
 def grobid_metadata_importer():
-    yield FatcatGrobidMetadataImporter("http://localhost:9411/v0")
+    yield GrobidMetadataImporter("http://localhost:9411/v0")
 
 # TODO: use API to check that entities actually created...
 #def test_grobid_metadata_importer_batch(grobid_metadata_importer):
diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py
index f45747ed..98a9f4a7 100644
--- a/python/tests/import_issn.py
+++ b/python/tests/import_issn.py
@@ -1,11 +1,11 @@
 
 import pytest
-from fatcat_tools.importers.issn import FatcatIssnImporter
+from fatcat_tools.importers import IssnImporter
 
 
 @pytest.fixture(scope="function")
 def issn_importer():
-    yield FatcatIssnImporter("http://localhost:9411/v0")
+    yield IssnImporter("http://localhost:9411/v0")
 
 # TODO: use API to check that entities actually created...
 def test_issn_importer_batch(issn_importer):
diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py
index 8004e3bd..85e21267 100644
--- a/python/tests/import_matched.py
+++ b/python/tests/import_matched.py
@@ -1,12 +1,12 @@
 
 import json
 import pytest
-from fatcat_tools.importers.matched import FatcatMatchedImporter
+from fatcat_tools.importers import MatchedImporter
 
 
 @pytest.fixture(scope="function")
 def matched_importer():
-    yield FatcatMatchedImporter("http://localhost:9411/v0")
+    yield MatchedImporter("http://localhost:9411/v0")
 
 # TODO: use API to check that entities actually created...
 def test_matched_importer_batch(matched_importer):
diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py
index 2dc98d76..18199888 100644
--- a/python/tests/import_orcid.py
+++ b/python/tests/import_orcid.py
@@ -1,12 +1,12 @@
 
 import json
 import pytest
-from fatcat_tools.importers.orcid import FatcatOrcidImporter
+from fatcat_tools.importers import OrcidImporter
 
 
 @pytest.fixture(scope="function")
 def orcid_importer():
-    yield FatcatOrcidImporter("http://localhost:9411/v0")
+    yield OrcidImporter("http://localhost:9411/v0")
 
 # TODO: use API to check that entities actually created...
 def test_orcid_importer_batch(orcid_importer):
diff --git a/python/tests/importer.py b/python/tests/importer.py
index d98638e4..f228a9b2 100644
--- a/python/tests/importer.py
+++ b/python/tests/importer.py
@@ -1,7 +1,7 @@
 
 
 import pytest
-from fatcat_tools.importers.common import FatcatImporter
+from fatcat_tools.importers import FatcatImporter
 
 
 def test_issnl_mapping_lookup():
diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py
index 52a9965a..18522eff 100644
--- a/python/tests/transform_tests.py
+++ b/python/tests/transform_tests.py
@@ -1,15 +1,14 @@
 
 import json
 import pytest
-from fatcat_tools.importers.crossref import FatcatCrossrefImporter
-from fatcat_tools.transforms import *
+from fatcat_tools import *
 
 from import_crossref import crossref_importer
 
-def test_elastic_convert(crossref_importer):
+def test_elasticsearch_convert(crossref_importer):
     with open('tests/files/crossref-works.single.json', 'r') as f:
         # not a single line
         raw = json.loads(f.read())
         (r, c) = crossref_importer.parse_crossref_dict(raw)
     r.state = 'active'
-    release_elastic_dict(r)
+    release_to_elasticsearch(r)
diff --git a/python/tox.ini b/python/tox.ini
index d4fc1d9d..f9d811cd 100644
--- a/python/tox.ini
+++ b/python/tox.ini
@@ -1,3 +1,6 @@
+# tox isn't actually used for anything right now, but might be a better tool
+# for CI or for testing compatibility with multiple python versions
+
 [tox]
 envlist = py35
 
diff --git a/python/uwsgi.ini b/python/uwsgi.ini
index fdf021b6..a03f72bb 100644
--- a/python/uwsgi.ini
+++ b/python/uwsgi.ini
@@ -1,3 +1,6 @@
+# uwsgi is used in production deployment, in combination with pipenv (which
+# generates the .venv file)
+
 [uwsgi]
 plugin = python3,http
 http = :9810
diff --git a/python/web_config.py b/python/web_config.py
index ed6d830a..749d0586 100644
--- a/python/web_config.py
+++ b/python/web_config.py
@@ -1,4 +1,14 @@
 
+"""
+Default configuration for fatcat web interface (Flask application).
+
+In production, we currently reconfigure these values using environment
+variables, not by (eg) deploying a variant copy of this file.
+
+This config is *only* for the web interface, *not* for any of the workers or
+import scripts.
+"""
+
 import os
 import subprocess
author	Bryan Newbold <bnewbold@robocracy.org>	2018-11-15 13:11:52 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2018-11-15 13:15:15 -0800
commit	bb28a3fc1cc900f2dde31e1dbc492d9661034f41 (patch)
tree	f037dd3d1bab6cbf08a562dbdd4c09361fe0c030
parent	9f817c6c70a749f2ac449ab4edfd26c6dd8a7410 (diff)
download	fatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.tar.gz fatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.zip