| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-08 16:28:27 -0800 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-08 16:28:27 -0800 |
| commit | 16f2e78298dbd2231f5f337ea17c89a6a131a052 (patch) | |
| tree | 6e72581e625e73c97cbab72d0f9c35665c99e5d7 /python/fatcat_tools | |
| parent | eb40a5f274f3608db34309cfd16739a7642ef5e7 (diff) | |
| parent | ffb721f90c5d97ee80885209bf45feb85ca9625c (diff) | |
| download | fatcat-16f2e78298dbd2231f5f337ea17c89a6a131a052.tar.gz, fatcat-16f2e78298dbd2231f5f337ea17c89a6a131a052.zip | |
Merge branch 'bnewbold-crude-auth'
Fixed a conflict in:
  python/fatcat_export.py
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/__init__.py | 3 |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/api_auth.py | 40 |
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 36 |
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 19 |
| -rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 13 |
| -rw-r--r-- | python/fatcat_tools/importers/issn.py | 10 |
| -rw-r--r-- | python/fatcat_tools/importers/matched.py | 18 |
| -rw-r--r-- | python/fatcat_tools/importers/orcid.py | 10 |
| -rw-r--r-- | python/fatcat_tools/transforms.py | 2 |
| -rw-r--r-- | python/fatcat_tools/workers/changelog.py | 8 |
| -rw-r--r-- | python/fatcat_tools/workers/worker_common.py | 8 |
11 files changed, 131 insertions(+), 36 deletions(-)
```diff
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index 0bb42ab5..e2b1e3a2 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,3 +1,4 @@
+from .api_auth import authenticated_api, public_api
 from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch
+from .transforms import entity_to_dict, entity_from_json, release_to_elasticsearch
diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py
new file mode 100644
index 00000000..c49051f6
--- /dev/null
+++ b/python/fatcat_tools/api_auth.py
@@ -0,0 +1,40 @@
+
+import os, sys
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+
+def public_api(host_uri):
+    """
+    Note: unlike the authenticated variant, this helper might get called even
+    if the API isn't going to be used, so it's important that it doesn't try to
+    actually connect to the API host or something.
+    """
+    conf = fatcat_client.Configuration()
+    conf.host = host_uri
+    return fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+
+def authenticated_api(host_uri, token=None):
+    """
+    Note: if this helper is called, it's implied that an actual API connection
+    is needed, so it does try to connect and verify credentials.
+    """
+
+    conf = fatcat_client.Configuration()
+    conf.host = host_uri
+    if not token:
+        token = os.environ['FATCAT_API_AUTH_TOKEN']
+    if not token:
+        sys.stderr.write(
+            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+        sys.exit(-1)
+
+    conf.api_key["Authorization"] = token
+    conf.api_key_prefix["Authorization"] = "Bearer"
+    api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+
+    # verify up front that auth is working
+    api.auth_check()
+
+    return api
+
```
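The two helpers split read-only from read-write setup: `public_api` only constructs a client object, while `authenticated_api` resolves a token and verifies it with `auth_check()` before returning. A minimal usage sketch (the host URI is illustrative; the token is assumed to live in `FATCAT_API_AUTH_TOKEN`):

```python
from fatcat_tools import public_api, authenticated_api

# Read-only client: safe to construct even if no API call is ever made,
# since it does not touch the network at construction time.
ro_api = public_api("http://localhost:9411/v0")

# Read-write client: picks up the token from FATCAT_API_AUTH_TOKEN (or an
# explicit token= argument) and calls auth_check() immediately, so bad
# credentials fail fast instead of partway through an import.
rw_api = authenticated_api("http://localhost:9411/v0")
```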
```diff
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index e31cabf8..e39ec6c9 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -4,6 +4,7 @@ import sys
 import csv
 import json
 import itertools
+import subprocess
 from collections import Counter
 
 import pykafka
@@ -37,19 +38,33 @@ class FatcatImporter:
     Base class for fatcat importers
     """
 
-    def __init__(self, host_url, issn_map_file=None):
-        conf = fatcat_client.Configuration()
-        conf.host = host_url
-        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+    def __init__(self, api, **kwargs):
+
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['git_rev'] = eg_extra.get('git_rev',
+            subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8')
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
+
+        self.api = api
+        self._editgroup_description = kwargs.get('editgroup_description')
+        self._editgroup_extra = kwargs.get('editgroup_extra')
+        issn_map_file = kwargs.get('issn_map_file')
+
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
         self._doi_id_map = dict()
-        self._issn_issnl_map = None
-        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
         if issn_map_file:
             self.read_issn_map_file(issn_map_file)
+        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
         self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0})
 
+    def _editgroup(self):
+        eg = fatcat_client.Editgroup(
+            description=self._editgroup_description,
+            extra=self._editgroup_extra,
+        )
+        return self.api.create_editgroup(eg)
+
     def describe_run(self):
         print("Processed {} lines, inserted {}, updated {}.".format(
             self.counts['processed_lines'], self.counts['insert'], self.counts['update']))
@@ -64,15 +79,13 @@ class FatcatImporter:
 
     def process_source(self, source, group_size=100):
         """Creates and auto-accepts editgroup every group_size rows"""
-        eg = self.api.create_editgroup(
-            fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+        eg = self._editgroup()
         i = 0
         for i, row in enumerate(source):
             self.create_row(row, editgroup_id=eg.editgroup_id)
             if i > 0 and (i % group_size) == 0:
                 self.api.accept_editgroup(eg.editgroup_id)
-                eg = self.api.create_editgroup(
-                    fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+                eg = self._editgroup()
             self.counts['processed_lines'] += 1
         if i == 0 or (i % group_size) != 0:
             self.api.accept_editgroup(eg.editgroup_id)
@@ -83,8 +96,7 @@ class FatcatImporter:
             if decode_kafka:
                 rows = [msg.value.decode('utf-8') for msg in rows]
             self.counts['processed_lines'] += len(rows)
-            eg = self.api.create_editgroup(
-                fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+            eg = self._editgroup()
             self.create_batch(rows, editgroup_id=eg.editgroup_id)
 
     def process_csv_source(self, source, group_size=100, delimiter=','):
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index d4d0de68..ed60a78c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,6 +4,7 @@ import json
 import sqlite3
 import datetime
 import itertools
+import subprocess
 
 import fatcat_client
 from .common import FatcatImporter
@@ -40,8 +41,19 @@ class CrossrefImporter(FatcatImporter):
     See https://github.com/CrossRef/rest-api-doc for JSON schema notes
     """
 
-    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True):
-        super().__init__(host_url, issn_map_file)
+    def __init__(self, api, issn_map_file, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of Crossref DOI metadata, harvested from REST API")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+        extid_map_file = kwargs.get('extid_map_file')
+        create_containers = kwargs.get('create_containers')
+        check_existing = kwargs.get('check_existing')
         self.extid_map_db = None
         if extid_map_file:
             db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -313,8 +325,7 @@ class CrossrefImporter(FatcatImporter):
             if entities is not None:
                 (re, ce) = entities
                 if ce is not None:
-                    ce_eg = self.api.create_editgroup(
-                        fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+                    ce_eg = self.api.create_editgroup(fatcat_client.Editgroup())
                     container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id)
                     self.api.accept_editgroup(ce_eg.editgroup_id)
                     re.container_id = container.ident
```
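Since importers now receive a pre-configured client rather than a host URL, one authenticated client can be shared across importers, with editgroup provenance (description, `agent`, `git_rev`) carried in kwargs. A sketch of the new calling convention (the file paths and the extra dict are illustrative, and the import assumes `CrossrefImporter` is re-exported from the `fatcat_tools.importers` package):

```python
from fatcat_tools import authenticated_api
from fatcat_tools.importers import CrossrefImporter

api = authenticated_api("http://localhost:9411/v0")
importer = CrossrefImporter(api,
    issn_map_file="/srv/fatcat/issn_map.tsv",      # hypothetical path
    editgroup_extra={'batch': 'example-2019-01'})  # merged with agent/git_rev defaults

# process_source() batches rows into editgroups and auto-accepts them
importer.process_source(open("/srv/fatcat/crossref-dump.json"))  # hypothetical dump
```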
```diff
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 2cb97b01..5e61a154 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -12,9 +12,16 @@ MAX_ABSTRACT_BYTES=4096
 
 class GrobidMetadataImporter(FatcatImporter):
 
-    def __init__(self, host_url, default_link_rel="web"):
-        super().__init__(host_url)
-        self.default_link_rel = default_link_rel
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Import of release and file metadata, as extracted from PDFs by GROBID.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
 
     def parse_grobid_json(self, obj):
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index 9b9ca63f..02a1eea0 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -35,6 +35,16 @@ class IssnImporter(FatcatImporter):
         ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
     """
 
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+
     def parse_issn_row(self, row):
         """
         row is a python dict (parsed from CSV).
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 5dbda27c..0b77bcf0 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -37,12 +37,18 @@ class MatchedImporter(FatcatImporter):
     - core_id, wikidata_id, pmcid, pmid: not as lists
     """
 
-    def __init__(self, host_url, skip_file_updates=False, default_mime=None,
-            default_link_rel="web"):
-        super().__init__(host_url)
-        self.default_mime = default_mime
-        self.default_link_rel = default_link_rel
-        self.skip_file_updates = skip_file_updates
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Import of large-scale file-to-release match results. Source of metadata varies.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+        self.default_mime = kwargs.get("default_mime", None)
+        self.skip_file_updates = kwargs.get("skip_file_updates", False)
 
     def make_url(self, raw):
         rel = self.default_link_rel
```
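Options that used to be positional constructor arguments (`default_mime`, `skip_file_updates`, and so on) are now looked up from kwargs with the same defaults, so call sites only name what they change. A sketch, reusing the `api` client from the earlier example (again assuming the importer is re-exported from `fatcat_tools.importers`):

```python
from fatcat_tools.importers import MatchedImporter

importer = MatchedImporter(api,
    default_link_rel="webarchive",  # instead of the "web" default
    skip_file_updates=True)         # default_mime stays at its None default
```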
```diff
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index fc4562d0..0aa4ab00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -22,6 +22,16 @@ def value_or_none(e):
 
 class OrcidImporter(FatcatImporter):
 
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of ORCID metadata, from official bulk releases.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+
     def parse_orcid_dict(self, obj):
         """
         obj is a python dict (parsed from json).
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 843c00a5..0f957f9a 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -2,7 +2,7 @@
 import collections
 from fatcat_client import ReleaseEntity, ApiClient
 
-def entity_to_json(entity):
+def entity_to_dict(entity):
     """
     Hack to take advantage of the code-generated serialization code
     """
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index b6d99d06..8690a791 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -12,11 +12,11 @@ class ChangelogWorker(FatcatWorker):
     found, fetch them and push (as JSON) into a Kafka topic.
     """
 
-    def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
+    def __init__(self, api, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
         # TODO: should be offset=0
         super().__init__(kafka_hosts=kafka_hosts,
                          produce_topic=produce_topic,
-                         api_host_url=api_host_url)
+                         api=api)
         self.poll_interval = poll_interval
         self.offset = offset    # the fatcat changelog offset, not the kafka offset
@@ -61,10 +61,10 @@ class EntityUpdatesWorker(FatcatWorker):
     For now, only release updates are published.
     """
 
-    def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic):
+    def __init__(self, api, kafka_hosts, consume_topic, release_topic):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic,
-                         api_host_url=api_host_url)
+                         api=api)
         self.release_topic = release_topic
         self.consumer_group = "entity-updates"
```
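The workers follow the same injection pattern: the caller builds the client and hands it in, instead of the worker constructing one from a host URL. A wiring sketch (Kafka hosts and topic names are illustrative, and the `run()` entrypoint is assumed from the workers' usual polling loop):

```python
from fatcat_tools import public_api
from fatcat_tools.workers.changelog import ChangelogWorker

# The changelog worker only reads, so a public (unauthenticated) client suffices.
api = public_api("http://localhost:9411/v0")
worker = ChangelogWorker(api, "localhost:9092",
    produce_topic="fatcat-changelog",  # hypothetical topic name
    poll_interval=10.0)
worker.run()
```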
""" -    def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic): +    def __init__(self, api, kafka_hosts, consume_topic, release_topic):          super().__init__(kafka_hosts=kafka_hosts,                           consume_topic=consume_topic, -                         api_host_url=api_host_url) +                         api=api)          self.release_topic = release_topic          self.consumer_group = "entity-updates" diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py index e400e815..b84341c7 100644 --- a/python/fatcat_tools/workers/worker_common.py +++ b/python/fatcat_tools/workers/worker_common.py @@ -45,11 +45,9 @@ class FatcatWorker:      Common code for for Kafka producers and consumers.      """ -    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None): -        if api_host_url: -            conf = fatcat_client.Configuration() -            conf.host = api_host_url -            self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) +    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api=None): +        if api: +            self.api = api          self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")          self.produce_topic = produce_topic          self.consume_topic = consume_topic | 
