Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/__init__.py                  |  3
-rw-r--r--  python/fatcat_tools/api_auth.py                  | 40
-rw-r--r--  python/fatcat_tools/importers/common.py          | 36
-rw-r--r--  python/fatcat_tools/importers/crossref.py        | 19
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py | 13
-rw-r--r--  python/fatcat_tools/importers/issn.py            | 10
-rw-r--r--  python/fatcat_tools/importers/matched.py         | 18
-rw-r--r--  python/fatcat_tools/importers/orcid.py           | 10
-rw-r--r--  python/fatcat_tools/transforms.py                |  2
-rw-r--r--  python/fatcat_tools/workers/changelog.py         |  8
-rw-r--r--  python/fatcat_tools/workers/worker_common.py     |  8
11 files changed, 131 insertions, 36 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index 0bb42ab5..e2b1e3a2 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,3 +1,4 @@
+from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch
+from .transforms import entity_to_dict, entity_from_json, release_to_elasticsearch
diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py
new file mode 100644
index 00000000..c49051f6
--- /dev/null
+++ b/python/fatcat_tools/api_auth.py
@@ -0,0 +1,40 @@
+
+import os, sys
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+
+def public_api(host_uri):
+    """
+    Note: unlike the authenticated variant, this helper might get called even
+    if the API isn't going to be used, so it's important that it doesn't try to
+    actually connect to the API host or something.
+    """
+    conf = fatcat_client.Configuration()
+    conf.host = host_uri
+    return fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+
+def authenticated_api(host_uri, token=None):
+    """
+    Note: if this helper is called, it's implied that an actual API connection
+    is needed, so it does try to connect and verify credentials.
+    """
+
+    conf = fatcat_client.Configuration()
+    conf.host = host_uri
+    if not token:
+        # use .get() so a missing env var falls through to the error below
+        token = os.environ.get('FATCAT_API_AUTH_TOKEN')
+    if not token:
+        sys.stderr.write(
+            'This client requires a fatcat API token (eg, in env var FATCAT_API_AUTH_TOKEN)\n')
+        sys.exit(-1)
+
+    conf.api_key["Authorization"] = token
+    conf.api_key_prefix["Authorization"] = "Bearer"
+    api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+
+    # verify up front that auth is working
+    api.auth_check()
+
+    return api
+
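The split between the two helpers is deliberate: public_api() only builds a client object and performs no network I/O, so it is safe to call unconditionally, while authenticated_api() eagerly calls auth_check() so bad credentials fail at startup rather than mid-import. A minimal usage sketch (the localhost host URI is an assumption for illustration):

    from fatcat_tools import public_api, authenticated_api

    # read-only client; constructing it does not touch the network
    api = public_api("http://localhost:9411/v0")

    # read-write client; falls back to the FATCAT_API_AUTH_TOKEN env var
    # when no token is passed, and exits early if auth_check() rejects it
    rw_api = authenticated_api("http://localhost:9411/v0", token=None)
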
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index e31cabf8..e39ec6c9 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -4,6 +4,7 @@ import sys
 import csv
 import json
 import itertools
+import subprocess
 from collections import Counter
 
 import pykafka
@@ -37,19 +38,33 @@ class FatcatImporter:
     Base class for fatcat importers
     """
 
-    def __init__(self, host_url, issn_map_file=None):
-        conf = fatcat_client.Configuration()
-        conf.host = host_url
-        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+    def __init__(self, api, **kwargs):
+
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['git_rev'] = eg_extra.get('git_rev',
+            subprocess.check_output(["git", "describe", "--always"]).strip().decode('utf-8'))
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.FatcatImporter')
+
+        self.api = api
+        self._editgroup_description = kwargs.get('editgroup_description')
+        self._editgroup_extra = eg_extra
+        issn_map_file = kwargs.get('issn_map_file')
+
         self._issnl_id_map = dict()
         self._orcid_id_map = dict()
         self._doi_id_map = dict()
-        self._issn_issnl_map = None
-        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
         if issn_map_file:
             self.read_issn_map_file(issn_map_file)
+        self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$")
         self.counts = Counter({'insert': 0, 'update': 0, 'processed_lines': 0})
 
+    def _editgroup(self):
+        eg = fatcat_client.Editgroup(
+            description=self._editgroup_description,
+            extra=self._editgroup_extra,
+        )
+        return self.api.create_editgroup(eg)
+
     def describe_run(self):
         print("Processed {} lines, inserted {}, updated {}.".format(
             self.counts['processed_lines'], self.counts['insert'], self.counts['update']))
@@ -64,15 +79,13 @@ class FatcatImporter:
 
     def process_source(self, source, group_size=100):
         """Creates and auto-accepts editgroup every group_size rows"""
-        eg = self.api.create_editgroup(
-            fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+        eg = self._editgroup()
         i = 0
         for i, row in enumerate(source):
            self.create_row(row, editgroup_id=eg.editgroup_id)
             if i > 0 and (i % group_size) == 0:
                 self.api.accept_editgroup(eg.editgroup_id)
-                eg = self.api.create_editgroup(
-                    fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+                eg = self._editgroup()
             self.counts['processed_lines'] += 1
         if i == 0 or (i % group_size) != 0:
             self.api.accept_editgroup(eg.editgroup_id)
@@ -83,8 +96,7 @@ class FatcatImporter:
         if decode_kafka:
             rows = [msg.value.decode('utf-8') for msg in rows]
         self.counts['processed_lines'] += len(rows)
-        eg = self.api.create_editgroup(
-            fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+        eg = self._editgroup()
         self.create_batch(rows, editgroup_id=eg.editgroup_id)
 
     def process_csv_source(self, source, group_size=100, delimiter=','):
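Subclasses only need to supply create_row() and/or create_batch(); the batching, editgroup creation, and acceptance logic above is shared. A sketch of how a caller might drive an importer, using a concrete subclass from later in this diff (the input filename is a placeholder):

    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import OrcidImporter   # import path assumed

    api = authenticated_api("http://localhost:9411/v0")
    importer = OrcidImporter(api,
        editgroup_description="Manual ORCID backfill",   # overrides the default
        editgroup_extra={'git_rev': 'v0.1.0'})           # merged with the agent default
    with open('orcid_dump.json') as f:                   # placeholder dump file
        importer.process_source(f, group_size=100)
    importer.describe_run()
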
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index d4d0de68..ed60a78c 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -4,6 +4,7 @@ import json
 import sqlite3
 import datetime
 import itertools
+import subprocess
 
 import fatcat_client
 from .common import FatcatImporter
@@ -40,8 +41,19 @@ class CrossrefImporter(FatcatImporter):
     See https://github.com/CrossRef/rest-api-doc for JSON schema notes
     """
 
-    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True):
-        super().__init__(host_url, issn_map_file)
+    def __init__(self, api, issn_map_file, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of Crossref DOI metadata, harvested from REST API")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter')
+        super().__init__(api,
+            issn_map_file=issn_map_file,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+        extid_map_file = kwargs.get('extid_map_file')
+        create_containers = kwargs.get('create_containers', True)
+        check_existing = kwargs.get('check_existing', True)
         self.extid_map_db = None
         if extid_map_file:
             db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -313,8 +325,7 @@ class CrossrefImporter(FatcatImporter):
         if entities is not None:
             (re, ce) = entities
             if ce is not None:
-                ce_eg = self.api.create_editgroup(
-                    fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae'))
+                ce_eg = self.api.create_editgroup(fatcat_client.Editgroup())
                 container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id)
                 self.api.accept_editgroup(ce_eg.editgroup_id)
                 re.container_id = container.ident
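Call sites now construct the importer with an API client plus keyword options; create_containers and check_existing keep their old True defaults when not passed. A hedged instantiation sketch (import path and file paths are placeholders):

    from fatcat_tools import authenticated_api
    from fatcat_tools.importers import CrossrefImporter   # import path assumed

    api = authenticated_api("http://localhost:9411/v0")
    importer = CrossrefImporter(api, "/srv/issn/ISSN-to-ISSN-L.txt",
        extid_map_file="/srv/extid_map.sqlite3",   # optional read-only sqlite3 lookup
        check_existing=True)                       # True is also the default
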
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 2cb97b01..5e61a154 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -12,9 +12,16 @@ MAX_ABSTRACT_BYTES=4096
 
 
 class GrobidMetadataImporter(FatcatImporter):
 
-    def __init__(self, host_url, default_link_rel="web"):
-        super().__init__(host_url)
-        self.default_link_rel = default_link_rel
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Import of release and file metadata, as extracted from PDFs by GROBID.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.GrobidMetadataImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
 
     def parse_grobid_json(self, obj):
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index 9b9ca63f..02a1eea0 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -35,6 +35,16 @@ class IssnImporter(FatcatImporter):
     ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
     """
 
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of container-level metadata, by ISSN. Metadata from Internet Archive munging.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IssnImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+
     def parse_issn_row(self, row):
         """
         row is a python dict (parsed from CSV).
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 5dbda27c..0b77bcf0 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -37,12 +37,18 @@ class MatchedImporter(FatcatImporter):
     - core_id, wikidata_id, pmcid, pmid: not as lists
     """
 
-    def __init__(self, host_url, skip_file_updates=False, default_mime=None,
-            default_link_rel="web"):
-        super().__init__(host_url)
-        self.default_mime = default_mime
-        self.default_link_rel = default_link_rel
-        self.skip_file_updates = skip_file_updates
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Import of large-scale file-to-release match results. Source of metadata varies.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.MatchedImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+        self.default_link_rel = kwargs.get("default_link_rel", "web")
+        self.default_mime = kwargs.get("default_mime", None)
+        self.skip_file_updates = kwargs.get("skip_file_updates", False)
 
     def make_url(self, raw):
         rel = self.default_link_rel
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index fc4562d0..0aa4ab00 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -22,6 +22,16 @@ def value_or_none(e):
 
 class OrcidImporter(FatcatImporter):
 
+    def __init__(self, api, **kwargs):
+
+        eg_desc = kwargs.get('editgroup_description',
+            "Automated import of ORCID metadata, from official bulk releases.")
+        eg_extra = kwargs.get('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.OrcidImporter')
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra)
+
     def parse_orcid_dict(self, obj):
         """
         obj is a python dict (parsed from json).
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 843c00a5..0f957f9a 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -2,7 +2,7 @@
 import collections
 from fatcat_client import ReleaseEntity, ApiClient
 
-def entity_to_json(entity):
+def entity_to_dict(entity):
     """
     Hack to take advantage of the code-generated serialization code
     """
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index b6d99d06..8690a791 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -12,11 +12,11 @@ class ChangelogWorker(FatcatWorker):
     found, fetch them and push (as JSON) into a Kafka topic.
     """
 
-    def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
+    def __init__(self, api, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
         # TODO: should be offset=0
         super().__init__(kafka_hosts=kafka_hosts,
                          produce_topic=produce_topic,
-                         api_host_url=api_host_url)
+                         api=api)
         self.poll_interval = poll_interval
         self.offset = offset    # the fatcat changelog offset, not the kafka offset
 
@@ -61,10 +61,10 @@ class EntityUpdatesWorker(FatcatWorker):
     For now, only release updates are published.
     """
 
-    def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic):
+    def __init__(self, api, kafka_hosts, consume_topic, release_topic):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic,
-                         api_host_url=api_host_url)
+                         api=api)
         self.release_topic = release_topic
         self.consumer_group = "entity-updates"
diff --git a/python/fatcat_tools/workers/worker_common.py b/python/fatcat_tools/workers/worker_common.py
index e400e815..b84341c7 100644
--- a/python/fatcat_tools/workers/worker_common.py
+++ b/python/fatcat_tools/workers/worker_common.py
@@ -45,11 +45,9 @@ class FatcatWorker:
     Common code for Kafka producers and consumers.
     """
 
-    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None):
-        if api_host_url:
-            conf = fatcat_client.Configuration()
-            conf.host = api_host_url
-            self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api=None):
+        if api:
+            self.api = api
         self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
         self.produce_topic = produce_topic
         self.consume_topic = consume_topic
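The workers get the same inversion of control: the caller builds the client (via public_api() or authenticated_api()) and hands it in, rather than each worker constructing its own from a host URL. A construction sketch; the import path, Kafka broker address, topic name, and run() entrypoint are assumptions:

    from fatcat_tools import public_api
    from fatcat_tools.workers import ChangelogWorker   # import path assumed

    api = public_api("http://localhost:9411/v0")
    worker = ChangelogWorker(api, "localhost:9092",    # placeholder Kafka broker
        produce_topic="fatcat-changelog",              # placeholder topic name
        poll_interval=10.0)
    worker.run()   # entrypoint assumed, not shown in this diff
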