Diffstat:
-rw-r--r--  python/Pipfile | 9
-rw-r--r--  python/README.md | 8
-rw-r--r--  python/README_import.md | 12
-rwxr-xr-x  python/codegen_python_client.sh | 3
-rw-r--r--  python/conftest.py | 5
-rwxr-xr-x  python/fatcat_export.py | 9
-rwxr-xr-x  python/fatcat_import.py | 76
-rw-r--r--  python/fatcat_tools/__init__.py | 3
-rw-r--r--  python/fatcat_tools/fcid.py | 6
-rw-r--r--  python/fatcat_tools/importers/__init__.py | 7
-rw-r--r--  python/fatcat_tools/importers/common.py | 3
-rw-r--r--  python/fatcat_tools/importers/crossref.py | 12
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py | 4
-rw-r--r--  python/fatcat_tools/importers/issn.py | 14
-rw-r--r--  python/fatcat_tools/importers/matched.py | 10
-rw-r--r--  python/fatcat_tools/importers/orcid.py | 4
-rw-r--r--  python/fatcat_tools/transforms.py | 6
-rw-r--r--  python/fatcat_tools/workers/__init__.py | 4
-rw-r--r--  python/fatcat_tools/workers/changelog.py | 7
-rw-r--r--  python/fatcat_tools/workers/elasticsearch.py (renamed from python/fatcat_tools/workers/elastic.py) | 28
-rw-r--r--  python/fatcat_web/search.py | 4
-rwxr-xr-x  python/fatcat_worker.py | 24
-rw-r--r--  python/tests/import_crossref.py | 4
-rw-r--r--  python/tests/import_grobid_metadata.py | 4
-rw-r--r--  python/tests/import_issn.py | 4
-rw-r--r--  python/tests/import_matched.py | 4
-rw-r--r--  python/tests/import_orcid.py | 4
-rw-r--r--  python/tests/importer.py | 2
-rw-r--r--  python/tests/transform_tests.py | 7
-rw-r--r--  python/tox.ini | 3
-rw-r--r--  python/uwsgi.ini | 3
-rw-r--r--  python/web_config.py | 10

32 files changed, 196 insertions(+), 107 deletions(-)
diff --git a/python/Pipfile b/python/Pipfile
index be62c111..f4137dca 100644
--- a/python/Pipfile
+++ b/python/Pipfile
@@ -1,3 +1,8 @@
+
+# This file is *not* used as part of bundling or distributing the python client
+# library (fatcat_client). It *is* shared by the web interface (flask app),
+# workers, and import scripts.
+
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
@@ -28,4 +33,8 @@ pykafka = "*"
python-dateutil = "*"
[requires]
+# Python 3.5 is the bundled (system) version of python for Ubuntu 16.04
+# (Xenial), currently the default on Internet Archive production VMs, as well
+# as Debian stable (stretch). Will probably bump to 3.6 in early 2019 when
+# updating both of these environments.
python_version = "3.5"
diff --git a/python/README.md b/python/README.md
index 9b244e2d..0cfbab21 100644
--- a/python/README.md
+++ b/python/README.md
@@ -23,10 +23,10 @@ locally):
## Web Interface
This project uses `pipenv` to manage dependencies, and assumes Python 3.5
-(which pipenv may install if you are running a different local version). You
-can can install `pipenv` with `pip`. You may want to set the
-`PIPENV_VENV_IN_PROJECT` environment variable on your development machine (see
-pipenv docs for details).
+(which pipenv may install if you are running a different local version; see
+notes in Pipfile). You can install `pipenv` with `pip`. You may want to set
+the `PIPENV_VENV_IN_PROJECT` environment variable on your development machine
+(see pipenv docs for details).
To just run the web interface (which will try to connect to a back-end API
server on the same machine by default), use:
diff --git a/python/README_import.md b/python/README_import.md
index 0264610b..6334dbc6 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -36,7 +36,7 @@ the others:
From CSV file:
# See "start off with" command above
- time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv
+ time ./fatcat_import.py issn /srv/fatcat/datasets/journal_extra_metadata.csv
Usually a couple minutes at most on fast production machine.
@@ -44,23 +44,23 @@ Usually a couple minutes at most on fast production machine.
Usually tens of minutes on fast production machine.
- time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
+ time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py orcid -
## Crossref
Usually 24 hours or so on fast production machine.
- time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+ time xzcat /srv/fatcat/datasets/crossref-works.2018-09-05.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
## Matched
Unknown speed!
# No file update for the first import...
- zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update -
+ zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-update -
# ... but do on the second
- zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+ zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
# GROBID extracted (release+file)
- time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-grobid-metadata -
+ time zcat /srv/fatcat/datasets/2018-09-23-0405.30-dumpgrobidmetainsertable.longtail_join.filtered.tsv.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py grobid-metadata -
diff --git a/python/codegen_python_client.sh b/python/codegen_python_client.sh
index 4c146f3f..85d12dc4 100755
--- a/python/codegen_python_client.sh
+++ b/python/codegen_python_client.sh
@@ -1,5 +1,8 @@
#!/bin/bash
+# This script re-generates the fatcat API client (fatcat_client) from the
+# swagger/openapi2 spec file, using automated tools ("codegen")
+
set -exu
set -o pipefail
diff --git a/python/conftest.py b/python/conftest.py
index aad02e76..c2a31562 100644
--- a/python/conftest.py
+++ b/python/conftest.py
@@ -1,3 +1,8 @@
+"""
+This file exists solely to prevent pytest from trying to import/test the
+fatcat_client ./setup.py file.
+"""
+
import sys
collect_ignore = ["setup.py"]
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index eadf69ab..60c1caf1 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -1,11 +1,18 @@
#!/usr/bin/env python3
+"""
+Note: this is *not* the tool used to generate "official" metadata dumps; that
+tool is written in rust and runs on the production infrastructure for speed.
+These scripts are just a demonstration of how the API *could* be scraped
+without permission by a third party.
+"""
+
import sys
import json
import argparse
import fatcat_client
from fatcat_client.rest import ApiException
-from fatcat_tools.fcid import uuid2fcid
+from fatcat_tools import uuid2fcid
def run_export_releases(args):
conf = fatcat_client.Configuration()
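For reference, a minimal sketch of the kind of per-entity export this script does, assuming the generated fatcat_client package exposes a DefaultApi with a get_release() method (those names are assumptions, not confirmed by this hunk):

    import json
    import fatcat_client
    from fatcat_tools import entity_to_json

    def export_one_release(host_url, ident):
        # point the generated swagger client at a fatcat API host
        conf = fatcat_client.Configuration()
        conf.host = host_url
        api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
        # fetch a single release entity and dump it as one JSON line
        release = api.get_release(ident)
        return json.dumps(entity_to_json(release))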
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 0ec0cfa8..a5527b8c 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -2,36 +2,34 @@
import sys
import argparse
-from fatcat_tools.importers.crossref import FatcatCrossrefImporter
-from fatcat_tools.importers.orcid import FatcatOrcidImporter
-from fatcat_tools.importers.issn import FatcatIssnImporter
-from fatcat_tools.importers.matched import FatcatMatchedImporter
-from fatcat_tools.importers.grobid_metadata import FatcatGrobidMetadataImporter
+from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \
+ IssnImporter, MatchedImporter, GrobidMetadataImporter
-def run_import_crossref(args):
- fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file,
+
+def run_crossref(args):
+ fci = CrossrefImporter(args.host_url, args.issn_map_file,
args.extid_map_file, create_containers=(not args.no_create_containers))
fci.process_batch(args.json_file, size=args.batch_size)
fci.describe_run()
-def run_import_orcid(args):
- foi = FatcatOrcidImporter(args.host_url)
+def run_orcid(args):
+ foi = OrcidImporter(args.host_url)
foi.process_batch(args.json_file, size=args.batch_size)
foi.describe_run()
-def run_import_issn(args):
- fii = FatcatIssnImporter(args.host_url)
+def run_issn(args):
+ fii = IssnImporter(args.host_url)
fii.process_csv_batch(args.csv_file, size=args.batch_size)
fii.describe_run()
-def run_import_matched(args):
- fmi = FatcatMatchedImporter(args.host_url,
+def run_matched(args):
+ fmi = MatchedImporter(args.host_url,
skip_file_update=args.no_file_update)
fmi.process_batch(args.json_file, size=args.batch_size)
fmi.describe_run()
-def run_import_grobid_metadata(args):
- fmi = FatcatGrobidMetadataImporter(args.host_url)
+def run_grobid_metadata(args):
+ fmi = GrobidMetadataImporter(args.host_url)
fmi.process_source(args.tsv_file, group_size=args.group_size)
fmi.describe_run()
@@ -45,60 +43,60 @@ def main():
help="connect to this host/port")
subparsers = parser.add_subparsers()
- sub_import_crossref = subparsers.add_parser('import-crossref')
- sub_import_crossref.set_defaults(func=run_import_crossref)
- sub_import_crossref.add_argument('json_file',
+ sub_crossref = subparsers.add_parser('crossref')
+ sub_crossref.set_defaults(func=run_crossref)
+ sub_crossref.add_argument('json_file',
help="crossref JSON file to import from",
default=sys.stdin, type=argparse.FileType('r'))
- sub_import_crossref.add_argument('issn_map_file',
+ sub_crossref.add_argument('issn_map_file',
help="ISSN to ISSN-L mapping file",
default=None, type=argparse.FileType('r'))
- sub_import_crossref.add_argument('extid_map_file',
+ sub_crossref.add_argument('extid_map_file',
help="DOI-to-other-identifiers sqlite3 database",
default=None, type=str)
- sub_import_crossref.add_argument('--no-create-containers',
+ sub_crossref.add_argument('--no-create-containers',
action='store_true',
help="skip creation of new container entities based on ISSN")
- sub_import_crossref.add_argument('--batch-size',
+ sub_crossref.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
- sub_import_orcid = subparsers.add_parser('import-orcid')
- sub_import_orcid.set_defaults(func=run_import_orcid)
- sub_import_orcid.add_argument('json_file',
+ sub_orcid = subparsers.add_parser('orcid')
+ sub_orcid.set_defaults(func=run_orcid)
+ sub_orcid.add_argument('json_file',
help="orcid JSON file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_import_orcid.add_argument('--batch-size',
+ sub_orcid.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
- sub_import_issn = subparsers.add_parser('import-issn')
- sub_import_issn.set_defaults(func=run_import_issn)
- sub_import_issn.add_argument('csv_file',
+ sub_issn = subparsers.add_parser('issn')
+ sub_issn.set_defaults(func=run_issn)
+ sub_issn.add_argument('csv_file',
help="Journal ISSN CSV metadata file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_import_issn.add_argument('--batch-size',
+ sub_issn.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
- sub_import_matched = subparsers.add_parser('import-matched')
- sub_import_matched.set_defaults(func=run_import_matched)
- sub_import_matched.add_argument('json_file',
+ sub_matched = subparsers.add_parser('matched')
+ sub_matched.set_defaults(func=run_matched)
+ sub_matched.add_argument('json_file',
help="JSON file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_import_matched.add_argument('--no-file-update',
+ sub_matched.add_argument('--no-file-update',
action='store_true',
help="don't lookup existing files, just insert (only for bootstrap)")
- sub_import_matched.add_argument('--batch-size',
+ sub_matched.add_argument('--batch-size',
help="size of batch to send",
default=50, type=int)
- sub_import_grobid_metadata = subparsers.add_parser('import-grobid-metadata')
- sub_import_grobid_metadata.set_defaults(func=run_import_grobid_metadata)
- sub_import_grobid_metadata.add_argument('tsv_file',
+ sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
+ sub_grobid_metadata.set_defaults(func=run_grobid_metadata)
+ sub_grobid_metadata.add_argument('tsv_file',
help="TSV file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_import_grobid_metadata.add_argument('--group-size',
+ sub_grobid_metadata.add_argument('--group-size',
help="editgroup group size to use",
default=75, type=int)
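All of the renamed subcommands follow the same argparse pattern: one subparser per importer, with set_defaults(func=...) wiring the subcommand to its run_*() helper. A stripped-down sketch of just that pattern (not the full script):

    import sys
    import argparse

    def run_orcid(args):
        print("would import ORCID JSON from", args.json_file.name)

    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument('--host-url', default="http://localhost:9411/v0")
        subparsers = parser.add_subparsers()

        # subcommand is now named 'orcid', not 'import-orcid'
        sub_orcid = subparsers.add_parser('orcid')
        sub_orcid.set_defaults(func=run_orcid)
        sub_orcid.add_argument('json_file',
            default=sys.stdin, type=argparse.FileType('r'))

        args = parser.parse_args()
        if not hasattr(args, 'func'):
            parser.print_help()
            sys.exit(1)
        args.func(args)

    if __name__ == '__main__':
        main()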
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
new file mode 100644
index 00000000..0bb42ab5
--- /dev/null
+++ b/python/fatcat_tools/__init__.py
@@ -0,0 +1,3 @@
+
+from .fcid import fcid2uuid, uuid2fcid
+from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch
diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py
index dd72b242..4194ea63 100644
--- a/python/fatcat_tools/fcid.py
+++ b/python/fatcat_tools/fcid.py
@@ -3,12 +3,18 @@ import base64
import uuid
def fcid2uuid(s):
+ """
+    Converts a fatcat identifier (base32-encoded string) to a UUID string
+ """
s = s.split('_')[-1].upper().encode('utf-8')
assert len(s) == 26
raw = base64.b32decode(s + b"======")
return str(uuid.UUID(bytes=raw)).lower()
def uuid2fcid(s):
+ """
+    Converts a UUID string to a fatcat identifier (base32-encoded string)
+ """
raw = uuid.UUID(s).bytes
return base64.b32encode(raw)[:26].lower().decode('utf-8')
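The two helpers are exact inverses; a quick round-trip check, runnable on its own since both functions appear in full above:

    import base64
    import uuid

    def fcid2uuid(s):
        s = s.split('_')[-1].upper().encode('utf-8')
        assert len(s) == 26
        raw = base64.b32decode(s + b"======")
        return str(uuid.UUID(bytes=raw)).lower()

    def uuid2fcid(s):
        raw = uuid.UUID(s).bytes
        return base64.b32encode(raw)[:26].lower().decode('utf-8')

    # 16 bytes of UUID encode to 26 base32 characters, hence the "======" padding
    orig = "00000000-0000-0000-3333-000000000001"
    ident = uuid2fcid(orig)
    assert len(ident) == 26
    assert fcid2uuid(ident) == orig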
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
new file mode 100644
index 00000000..0f5fafb6
--- /dev/null
+++ b/python/fatcat_tools/importers/__init__.py
@@ -0,0 +1,7 @@
+
+from .common import FatcatImporter
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .grobid_metadata import GrobidMetadataImporter
+from .issn import IssnImporter
+from .matched import MatchedImporter
+from .orcid import OrcidImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index d289171d..9cf92b41 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -16,6 +16,9 @@ def grouper(iterable, n, fillvalue=None):
return itertools.zip_longest(*args, fillvalue=fillvalue)
class FatcatImporter:
+ """
+ Base class for fatcat importers
+ """
def __init__(self, host_url, issn_map_file=None):
conf = fatcat_client.Configuration()
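The grouper() helper above is the standard itertools batching recipe (only its return statement falls inside this hunk); FatcatImporter uses it to send entities to the API in fixed-size batches. A self-contained sketch of the pattern:

    import itertools

    def grouper(iterable, n, fillvalue=None):
        """Collect data into fixed-length chunks or blocks"""
        args = [iter(iterable)] * n
        return itertools.zip_longest(*args, fillvalue=fillvalue)

    # batch 7 parsed entities into groups of 3; the last group is padded with None
    for batch in grouper(range(7), 3):
        real = [x for x in batch if x is not None]
        print(real)   # [0, 1, 2] then [3, 4, 5] then [6]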
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fe80c2d3..fac8f32b 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -5,9 +5,11 @@ import sqlite3
import datetime
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
+# The docs/guide should be the canonical home for these mappings; update there
+# first
CROSSREF_TYPE_MAP = {
'book': 'book',
'book-chapter': 'chapter',
@@ -29,8 +31,14 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
+class CrossrefImporter(FatcatImporter):
+ """
+ Importer for Crossref metadata.
-class FatcatCrossrefImporter(FatcatImporter):
+ Can use a local sqlite3 file for faster "external identifier" lookups
+
+ See https://github.com/CrossRef/rest-api-doc for JSON schema notes
+ """
def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
super().__init__(host_url, issn_map_file)
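The extid_map_file argument points at a local sqlite3 database mapping DOIs to other identifiers, so imports avoid network lookups. Something along these lines, with the table and column names being assumptions (the schema is produced by separate dump scripts, not this diff):

    import sqlite3

    def lookup_extids(extid_db_path, doi):
        # read-only lookup in a local "DOI -> other identifiers" sqlite3 map
        db = sqlite3.connect(extid_db_path)
        row = db.execute(
            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi = ? LIMIT 1",
            [doi.lower()]).fetchone()
        if row is None:
            return {}
        return dict(core=row[0], pmid=row[1], pmcid=row[2], wikidata=row[3])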
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index dedc9728..ba8a4e6f 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,12 @@ import json
import base64
import datetime
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
MAX_ABSTRACT_BYTES=4096
-class FatcatGrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(FatcatImporter):
def __init__(self, host_url, default_link_rel="web"):
super().__init__(host_url)
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index ba8492c6..0b0efccb 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -3,10 +3,8 @@ import sys
import json
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
-# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
def or_none(s):
if s is None:
@@ -26,7 +24,15 @@ def truthy(s):
else:
return None
-class FatcatIssnImporter(FatcatImporter):
+class IssnImporter(FatcatImporter):
+ """
+ Imports journal metadata ("containers") by ISSN, currently from a custom
+ (data munged) .csv file format
+
+ CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+ ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+ """
def parse_issn_row(self, row):
"""
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 774019c7..732fccbe 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
#row = row.split('\t')
#assert len(row) == 2
@@ -13,8 +13,14 @@ from fatcat_tools.importers.common import FatcatImporter
#print(sha1)
#dois = [d.lower() for d in json.loads(row[1])]
-class FatcatMatchedImporter(FatcatImporter):
+class MatchedImporter(FatcatImporter):
"""
+ Importer for "file to crossref DOI" matches.
+
+ These matches are currently generated by Internet Archive hadoop jobs
+ written in scala (part of the 'sandcrawler' repo/project), but could be
+ generated by other parties as well.
+
Input format is JSON with keys:
- dois (list)
- sha1 (hex)
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 527316dd..9e4767f9 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
def value_or_none(e):
if type(e) == dict:
@@ -20,7 +20,7 @@ def value_or_none(e):
return None
return e
-class FatcatOrcidImporter(FatcatImporter):
+class OrcidImporter(FatcatImporter):
def parse_orcid_dict(self, obj):
"""
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index e10c6ba5..147acd7c 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -3,6 +3,9 @@ import collections
from fatcat_client import ReleaseEntity, ApiClient
def entity_to_json(entity):
+ """
+ Hack to take advantage of the code-generated serialization code
+ """
ac = ApiClient()
return ac.sanitize_for_serialization(entity)
@@ -15,11 +18,12 @@ def entity_from_json(json_str, entity_type):
thing.data = json_str
return ac.deserialize(thing, entity_type)
-def release_elastic_dict(release):
+def release_to_elasticsearch(release):
"""
Converts from an entity model/schema to elasticsearch oriented schema.
Returns: dict
+ Raises exception on error (never returns None)
"""
if release.state != 'active':
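A usage sketch of the serialization helpers, assuming a working fatcat_client install: entity_to_json() piggybacks on the generated client's sanitize_for_serialization(), and entity_from_json() deserializes back through ApiClient:

    import json
    from fatcat_client import ReleaseEntity
    from fatcat_tools import entity_to_json, entity_from_json

    release = ReleaseEntity(title="dummy", release_type="article-journal")

    # entity -> plain dict (via the generated serializer) -> JSON string
    json_str = json.dumps(entity_to_json(release))

    # JSON string -> entity object of the requested type
    roundtrip = entity_from_json(json_str, ReleaseEntity)
    assert roundtrip.title == "dummy"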
diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py
new file mode 100644
index 00000000..e8973bc3
--- /dev/null
+++ b/python/fatcat_tools/workers/__init__.py
@@ -0,0 +1,4 @@
+
+from .changelog import ChangelogWorker, EntityUpdatesWorker
+from .elasticsearch import ElasticsearchReleaseWorker
+from .worker_common import most_recent_message, FatcatWorker
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d07b5988..e64c043b 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -1,11 +1,12 @@
import json
import time
-from fatcat_tools.workers.worker_common import FatcatWorker, most_recent_message
from pykafka.common import OffsetType
+from .worker_common import FatcatWorker, most_recent_message
-class FatcatChangelogWorker(FatcatWorker):
+
+class ChangelogWorker(FatcatWorker):
"""
Periodically polls the fatcat API looking for new changelogs. When they are
found, fetch them and push (as JSON) into a Kafka topic.
@@ -52,7 +53,7 @@ class FatcatChangelogWorker(FatcatWorker):
time.sleep(self.poll_interval)
-class FatcatEntityUpdatesWorker(FatcatWorker):
+class EntityUpdatesWorker(FatcatWorker):
"""
Consumes from the changelog topic and publishes expanded entities (fetched
from API) to update topics.
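The ChangelogWorker docstring describes a poll-and-publish loop; a hedged sketch of that shape using pykafka's sync producer. The fatcat API method name (get_changelog()) and the entry fields are assumptions based on the public /changelog endpoint, not shown in this diff:

    import json
    import time
    from pykafka import KafkaClient

    def poll_changelog_forever(api, kafka_hosts, topic_name, poll_interval=10.0):
        client = KafkaClient(hosts=kafka_hosts)
        topic = client.topics[topic_name.encode('utf-8')]
        last_index = 0
        with topic.get_sync_producer() as producer:
            while True:
                for entry in api.get_changelog():
                    if entry.index > last_index:
                        # push each new changelog entry into the topic as JSON
                        producer.produce(
                            json.dumps(entry.to_dict(), default=str).encode('utf-8'))
                        last_index = entry.index
                time.sleep(poll_interval)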
diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elasticsearch.py
index 3a75a1b3..e7abd5ee 100644
--- a/python/fatcat_tools/workers/elastic.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,14 +2,14 @@
import json
import time
import requests
-from fatcat_tools.transforms import release_elastic_dict
-from fatcat_tools.workers.worker_common import FatcatWorker
-from fatcat_client import ReleaseEntity
-from fatcat_tools.transforms import *
from pykafka.common import OffsetType
+from fatcat_client import ReleaseEntity
+from fatcat_tools import *
+from .worker_common import FatcatWorker
+
-class FatcatElasticReleaseWorker(FatcatWorker):
+class ElasticsearchReleaseWorker(FatcatWorker):
"""
Consumes from release-updates topic and pushes into (presumably local)
elasticsearch.
@@ -18,13 +18,13 @@ class FatcatElasticReleaseWorker(FatcatWorker):
"""
def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- elastic_backend="http://localhost:9200", elastic_index="fatcat"):
+ elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
super().__init__(kafka_hosts=kafka_hosts,
consume_topic=consume_topic,
api_host_url=None)
- self.consumer_group = "elastic-updates"
- self.elastic_backend = elastic_backend
- self.elastic_index = elastic_index
+ self.consumer_group = "elasticsearch-updates"
+ self.elasticsearch_backend = elasticsearch_backend
+ self.elasticsearch_index = elasticsearch_index
def run(self):
consume_topic = self.kafka.topics[self.consume_topic]
@@ -42,11 +42,11 @@ class FatcatElasticReleaseWorker(FatcatWorker):
json_str = msg.value.decode('utf-8')
release = entity_from_json(json_str, ReleaseEntity)
#print(release)
- elastic_endpoint = "{}/{}/release/{}".format(
- self.elastic_backend,
- self.elastic_index,
+ elasticsearch_endpoint = "{}/{}/release/{}".format(
+ self.elasticsearch_backend,
+ self.elasticsearch_index,
release.ident)
- print("Updating document: {}".format(elastic_endpoint))
- resp = requests.post(elastic_endpoint, json=release_elastic_dict(release))
+ print("Updating document: {}".format(elasticsearch_endpoint))
+ resp = requests.post(elasticsearch_endpoint, json=release_to_elasticsearch(release))
assert resp.status_code in (200, 201)
#consumer.commit_offsets()
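The heart of the renamed worker's run() loop is a transform-then-POST into elasticsearch's document API, as the hunk above shows; pulled out as a standalone sketch (host and index values are the constructor defaults):

    import requests
    from fatcat_client import ReleaseEntity
    from fatcat_tools import entity_from_json, release_to_elasticsearch

    def push_release(json_str,
                     elasticsearch_backend="http://localhost:9200",
                     elasticsearch_index="fatcat"):
        release = entity_from_json(json_str, ReleaseEntity)
        # one elasticsearch document per release, keyed by the fatcat ident
        endpoint = "{}/{}/release/{}".format(
            elasticsearch_backend, elasticsearch_index, release.ident)
        resp = requests.post(endpoint, json=release_to_elasticsearch(release))
        assert resp.status_code in (200, 201)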
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index 7fcdabea..0a3ea40e 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -3,6 +3,10 @@ import requests
from flask import abort
from fatcat_web import app
+"""
+Helpers for doing elasticsearch queries (used in the web interface; not part of
+the formal API)
+"""
def do_search(q, limit=50, fulltext_only=True):
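The body of do_search() is not part of this hunk; a plausible shape for such a helper, hitting elasticsearch's _search endpoint directly with requests (the index name and the fulltext filter field are assumptions, not taken from this diff):

    import requests
    from flask import abort

    ELASTICSEARCH_BACKEND = "http://localhost:9200"
    ELASTICSEARCH_INDEX = "fatcat"

    def do_search(q, limit=50, fulltext_only=True):
        if fulltext_only:
            # hypothetical filter field restricting results to releases with an archived file
            q += " in_web:true"
        query = {
            "query": {"query_string": {"query": q, "default_operator": "AND"}},
            "size": int(limit),
        }
        resp = requests.get("{}/{}/_search".format(
            ELASTICSEARCH_BACKEND, ELASTICSEARCH_INDEX), json=query)
        if resp.status_code != 200:
            abort(resp.status_code)
        return [h['_source'] for h in resp.json()['hits']['hits']]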
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index 4c52d2c1..2f883fe0 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -3,27 +3,27 @@
import sys
import argparse
import datetime
-from fatcat_tools.workers.changelog import FatcatChangelogWorker, FatcatEntityUpdatesWorker
-from fatcat_tools.workers.elastic import FatcatElasticReleaseWorker
+from fatcat_tools.workers import ChangelogWorker, EntityUpdatesWorker, ElasticsearchReleaseWorker
+
def run_changelog(args):
topic = "fatcat-{}.changelog".format(args.env)
- worker = FatcatChangelogWorker(args.api_host_url, args.kafka_hosts, topic,
+ worker = ChangelogWorker(args.api_host_url, args.kafka_hosts, topic,
args.poll_interval)
worker.run()
def run_entity_updates(args):
changelog_topic = "fatcat-{}.changelog".format(args.env)
release_topic = "fatcat-{}.release-updates".format(args.env)
- worker = FatcatEntityUpdatesWorker(args.api_host_url, args.kafka_hosts,
+ worker = EntityUpdatesWorker(args.api_host_url, args.kafka_hosts,
changelog_topic, release_topic)
worker.run()
-def run_elastic_release(args):
+def run_elasticsearch_release(args):
consume_topic = "fatcat-{}.release-updates".format(args.env)
- worker = FatcatElasticReleaseWorker(args.kafka_hosts,
- consume_topic, elastic_backend=args.elastic_backend,
- elastic_index=args.elastic_index)
+    worker = ElasticsearchReleaseWorker(args.kafka_hosts,
+ consume_topic, elasticsearch_backend=args.elasticsearch_backend,
+ elasticsearch_index=args.elasticsearch_index)
worker.run()
def main():
@@ -51,12 +51,12 @@ def main():
sub_entity_updates = subparsers.add_parser('entity-updates')
sub_entity_updates.set_defaults(func=run_entity_updates)
- sub_elastic_release = subparsers.add_parser('elastic-release')
- sub_elastic_release.set_defaults(func=run_elastic_release)
- sub_elastic_release.add_argument('--elastic-backend',
+ sub_elasticsearch_release = subparsers.add_parser('elasticsearch-release')
+ sub_elasticsearch_release.set_defaults(func=run_elasticsearch_release)
+ sub_elasticsearch_release.add_argument('--elasticsearch-backend',
help="elasticsearch backend to connect to",
default="http://localhost:9200")
- sub_elastic_release.add_argument('--elastic-index',
+ sub_elasticsearch_release.add_argument('--elasticsearch-index',
help="elasticsearch index to push into",
default="fatcat")
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index 078db184..c129e729 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -1,13 +1,13 @@
import json
import pytest
-from fatcat_tools.importers.crossref import FatcatCrossrefImporter
+from fatcat_tools.importers import CrossrefImporter
@pytest.fixture(scope="function")
def crossref_importer():
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
+ yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
def test_crossref_importer_batch(crossref_importer):
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
diff --git a/python/tests/import_grobid_metadata.py b/python/tests/import_grobid_metadata.py
index 8b268e21..ee7040c9 100644
--- a/python/tests/import_grobid_metadata.py
+++ b/python/tests/import_grobid_metadata.py
@@ -3,7 +3,7 @@ import os
import json
import base64
import pytest
-from fatcat_tools.importers.grobid_metadata import FatcatGrobidMetadataImporter
+from fatcat_tools.importers import GrobidMetadataImporter
"""
WARNING: these tests are currently very fragile because they have database
@@ -12,7 +12,7 @@ side-effects. Should probably be disabled or re-written.
@pytest.fixture(scope="function")
def grobid_metadata_importer():
- yield FatcatGrobidMetadataImporter("http://localhost:9411/v0")
+ yield GrobidMetadataImporter("http://localhost:9411/v0")
# TODO: use API to check that entities actually created...
#def test_grobid_metadata_importer_batch(grobid_metadata_importer):
diff --git a/python/tests/import_issn.py b/python/tests/import_issn.py
index f45747ed..98a9f4a7 100644
--- a/python/tests/import_issn.py
+++ b/python/tests/import_issn.py
@@ -1,11 +1,11 @@
import pytest
-from fatcat_tools.importers.issn import FatcatIssnImporter
+from fatcat_tools.importers import IssnImporter
@pytest.fixture(scope="function")
def issn_importer():
- yield FatcatIssnImporter("http://localhost:9411/v0")
+ yield IssnImporter("http://localhost:9411/v0")
# TODO: use API to check that entities actually created...
def test_issn_importer_batch(issn_importer):
diff --git a/python/tests/import_matched.py b/python/tests/import_matched.py
index 8004e3bd..85e21267 100644
--- a/python/tests/import_matched.py
+++ b/python/tests/import_matched.py
@@ -1,12 +1,12 @@
import json
import pytest
-from fatcat_tools.importers.matched import FatcatMatchedImporter
+from fatcat_tools.importers import MatchedImporter
@pytest.fixture(scope="function")
def matched_importer():
- yield FatcatMatchedImporter("http://localhost:9411/v0")
+ yield MatchedImporter("http://localhost:9411/v0")
# TODO: use API to check that entities actually created...
def test_matched_importer_batch(matched_importer):
diff --git a/python/tests/import_orcid.py b/python/tests/import_orcid.py
index 2dc98d76..18199888 100644
--- a/python/tests/import_orcid.py
+++ b/python/tests/import_orcid.py
@@ -1,12 +1,12 @@
import json
import pytest
-from fatcat_tools.importers.orcid import FatcatOrcidImporter
+from fatcat_tools.importers import OrcidImporter
@pytest.fixture(scope="function")
def orcid_importer():
- yield FatcatOrcidImporter("http://localhost:9411/v0")
+ yield OrcidImporter("http://localhost:9411/v0")
# TODO: use API to check that entities actually created...
def test_orcid_importer_batch(orcid_importer):
diff --git a/python/tests/importer.py b/python/tests/importer.py
index d98638e4..f228a9b2 100644
--- a/python/tests/importer.py
+++ b/python/tests/importer.py
@@ -1,7 +1,7 @@
import pytest
-from fatcat_tools.importers.common import FatcatImporter
+from fatcat_tools.importers import FatcatImporter
def test_issnl_mapping_lookup():
diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py
index 52a9965a..18522eff 100644
--- a/python/tests/transform_tests.py
+++ b/python/tests/transform_tests.py
@@ -1,15 +1,14 @@
import json
import pytest
-from fatcat_tools.importers.crossref import FatcatCrossrefImporter
-from fatcat_tools.transforms import *
+from fatcat_tools import *
from import_crossref import crossref_importer
-def test_elastic_convert(crossref_importer):
+def test_elasticsearch_convert(crossref_importer):
with open('tests/files/crossref-works.single.json', 'r') as f:
# not a single line
raw = json.loads(f.read())
(r, c) = crossref_importer.parse_crossref_dict(raw)
r.state = 'active'
- release_elastic_dict(r)
+ release_to_elasticsearch(r)
diff --git a/python/tox.ini b/python/tox.ini
index d4fc1d9d..f9d811cd 100644
--- a/python/tox.ini
+++ b/python/tox.ini
@@ -1,3 +1,6 @@
+# tox isn't actually used for anything right now, but might be a better tool
+# for CI or for testing compatibility with multiple python versions
+
[tox]
envlist = py35
diff --git a/python/uwsgi.ini b/python/uwsgi.ini
index fdf021b6..a03f72bb 100644
--- a/python/uwsgi.ini
+++ b/python/uwsgi.ini
@@ -1,3 +1,6 @@
+# uwsgi is used in production deployment, in combination with pipenv (which
+# generates the .venv directory)
+
[uwsgi]
plugin = python3,http
http = :9810
diff --git a/python/web_config.py b/python/web_config.py
index ed6d830a..749d0586 100644
--- a/python/web_config.py
+++ b/python/web_config.py
@@ -1,4 +1,14 @@
+"""
+Default configuration for fatcat web interface (Flask application).
+
+In production, we currently reconfigure these values using environment
+variables, not by (eg) deploying a variant copy of this file.
+
+This config is *only* for the web interface, *not* for any of the workers or
+import scripts.
+"""
+
import os
import subprocess
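The "reconfigure via environment variables" pattern mentioned in the new docstring usually looks like the snippet below; the variable names are illustrative, not taken from this diff:

    import os
    import subprocess

    class Config(object):
        # each setting falls back to a development-friendly default when the
        # corresponding environment variable is unset
        FATCAT_API_HOST = os.environ.get(
            "FATCAT_API_HOST", "http://localhost:9411/v0")
        ELASTICSEARCH_BACKEND = os.environ.get(
            "ELASTICSEARCH_BACKEND", "http://localhost:9200")
        # e.g. embed the current git revision into page footers
        GIT_REVISION = subprocess.check_output(
            ["git", "describe", "--always"]).strip().decode('utf-8')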