diff options
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/api_auth.py | 6 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 32 | ||||
| -rw-r--r-- | python/fatcat_tools/importers/crossref.py | 16 | 
3 files changed, 40 insertions, 14 deletions
| diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py index b36d467c..c49051f6 100644 --- a/python/fatcat_tools/api_auth.py +++ b/python/fatcat_tools/api_auth.py @@ -1,5 +1,5 @@ -import sys +import os, sys  import fatcat_client  from fatcat_client.rest import ApiException @@ -23,7 +23,7 @@ def authenticated_api(host_uri, token=None):      conf = fatcat_client.Configuration()      conf.host = host_uri      if not token: -        token = sys.env['FATCAT_API_AUTH_TOKEN'] +        token = os.environ['FATCAT_API_AUTH_TOKEN']      if not token:          sys.stderr.write(              'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n') @@ -34,7 +34,7 @@ def authenticated_api(host_uri, token=None):      api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))      # verify up front that auth is working -    api.check_auth() +    api.auth_check()      return api diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 40c7abc0..5c33ebc9 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -4,6 +4,7 @@ import sys  import csv  import json  import itertools +import subprocess  from collections import Counter  import pykafka @@ -37,19 +38,32 @@ class FatcatImporter:      Base class for fatcat importers      """ -    def __init__(self, host_url, issn_map_file=None): -        conf = fatcat_client.Configuration() -        conf.host = host_url -        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) +    def __init__(self, api, **kwargs): + +        eg_extra = kwargs.get('editgroup_extra', dict()) +        eg_extra['git_rev'] = eg_extra.get('git_rev', +            subprocess.check_output(["git", "describe", "--always"]).strip()).decode('utf-8') +         +        self.api = api +        self._editgroup_description = kwargs.get('editgroup_description') +        self._editgroup_extra = kwargs.get('editgroup_extra') +        issn_map_file = kwargs.get('issn_map_file') +          self._issnl_id_map = dict()          self._orcid_id_map = dict()          self._doi_id_map = dict() -        self._issn_issnl_map = None -        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")          if issn_map_file:              self.read_issn_map_file(issn_map_file) +        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")          self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0}) +    def _editgroup(self): +        eg = fatcat_client.Editgroup( +            description=self._editgroup_description, +            extra=self._editgroup_extra, +        ) +        return self.api.create_editgroup(eg) +      def describe_run(self):          print("Processed {} lines, inserted {}, updated {}.".format(              self.counts['processed_lines'], self.counts['insert'], self.counts['update'])) @@ -64,13 +78,13 @@ class FatcatImporter:      def process_source(self, source, group_size=100):          """Creates and auto-accepts editgroup every group_size rows""" -        eg = self.api.create_editgroup(fatcat_client.Editgroup()) +        eg = self._editgroup()          i = 0          for i, row in enumerate(source):              self.create_row(row, editgroup_id=eg.editgroup_id)              if i > 0 and (i % group_size) == 0:                  self.api.accept_editgroup(eg.editgroup_id) -                eg = self.api.create_editgroup(fatcat_client.Editgroup()) +                eg = self._editgroup()              self.counts['processed_lines'] += 1          if i == 0 or (i % group_size) != 0:              self.api.accept_editgroup(eg.editgroup_id) @@ -81,7 +95,7 @@ class FatcatImporter:              if decode_kafka:                  rows = [msg.value.decode('utf-8') for msg in rows]              self.counts['processed_lines'] += len(rows) -            eg = self.api.create_editgroup(fatcat_client.Editgroup()) +            eg = self._editgroup()              self.create_batch(rows, editgroup_id=eg.editgroup_id)      def process_csv_source(self, source, group_size=100, delimiter=','): diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 05543590..4f7faf59 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -4,6 +4,7 @@ import json  import sqlite3  import datetime  import itertools +import subprocess  import fatcat_client  from .common import FatcatImporter @@ -40,8 +41,19 @@ class CrossrefImporter(FatcatImporter):      See https://github.com/CrossRef/rest-api-doc for JSON schema notes      """ -    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True): -        super().__init__(host_url, issn_map_file) +    def __init__(self, api, issn_map_file, **kwargs): + +        eg_desc = kwargs.get('editgroup_description', +            "Automated import of Crossref DOI metadata, harvested from REST API") +        eg_extra = kwargs.get('editgroup_extra', dict()) +        eg_extra['agent'] = eg_extra.get('agent', 'CrossrefImporter') +        super().__init__(api, +            issn_map_file=issn_map_file, +            editgroup_description=eg_desc, +            editgroup_extra=eg_extra) +        extid_map_file = kwargs.get('extid_map_file') +        create_containers = kwargs.get('create_containers') +        check_existing = kwargs.get('check_existing')          self.extid_map_db = None          if extid_map_file:              db_uri = "file:{}?mode=ro".format(extid_map_file) | 
