aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-11-15 13:11:52 -0800
committerBryan Newbold <bnewbold@robocracy.org>2018-11-15 13:15:15 -0800
commitbb28a3fc1cc900f2dde31e1dbc492d9661034f41 (patch)
treef037dd3d1bab6cbf08a562dbdd4c09361fe0c030 /python/fatcat_tools
parent9f817c6c70a749f2ac449ab4edfd26c6dd8a7410 (diff)
downloadfatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.tar.gz
fatcat-bb28a3fc1cc900f2dde31e1dbc492d9661034f41.zip
large refactor of python names/paths
- Add __init__.py files for fatcat_tools submodules, and use them in imports - Add a bunch of comments to files. - rename a number of classes and functions to be less verbose
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/__init__.py3
-rw-r--r--python/fatcat_tools/fcid.py6
-rw-r--r--python/fatcat_tools/importers/__init__.py7
-rw-r--r--python/fatcat_tools/importers/common.py3
-rw-r--r--python/fatcat_tools/importers/crossref.py12
-rw-r--r--python/fatcat_tools/importers/grobid_metadata.py4
-rw-r--r--python/fatcat_tools/importers/issn.py14
-rw-r--r--python/fatcat_tools/importers/matched.py10
-rw-r--r--python/fatcat_tools/importers/orcid.py4
-rw-r--r--python/fatcat_tools/transforms.py6
-rw-r--r--python/fatcat_tools/workers/__init__.py4
-rw-r--r--python/fatcat_tools/workers/changelog.py7
-rw-r--r--python/fatcat_tools/workers/elasticsearch.py (renamed from python/fatcat_tools/workers/elastic.py)28
13 files changed, 78 insertions, 30 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
new file mode 100644
index 00000000..0bb42ab5
--- /dev/null
+++ b/python/fatcat_tools/__init__.py
@@ -0,0 +1,3 @@
+
+from .fcid import fcid2uuid, uuid2fcid
+from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch
diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py
index dd72b242..4194ea63 100644
--- a/python/fatcat_tools/fcid.py
+++ b/python/fatcat_tools/fcid.py
@@ -3,12 +3,18 @@ import base64
import uuid
def fcid2uuid(s):
+ """
+ Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object
+ """
s = s.split('_')[-1].upper().encode('utf-8')
assert len(s) == 26
raw = base64.b32decode(s + b"======")
return str(uuid.UUID(bytes=raw)).lower()
def uuid2fcid(s):
+ """
+ Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
+ """
raw = uuid.UUID(s).bytes
return base64.b32encode(raw)[:26].lower().decode('utf-8')
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
new file mode 100644
index 00000000..0f5fafb6
--- /dev/null
+++ b/python/fatcat_tools/importers/__init__.py
@@ -0,0 +1,7 @@
+
+from .common import FatcatImporter
+from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP
+from .grobid_metadata import GrobidMetadataImporter
+from .issn import IssnImporter
+from .matched import MatchedImporter
+from .orcid import OrcidImporter
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index d289171d..9cf92b41 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -16,6 +16,9 @@ def grouper(iterable, n, fillvalue=None):
return itertools.zip_longest(*args, fillvalue=fillvalue)
class FatcatImporter:
+ """
+ Base class for fatcat importers
+ """
def __init__(self, host_url, issn_map_file=None):
conf = fatcat_client.Configuration()
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index fe80c2d3..fac8f32b 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -5,9 +5,11 @@ import sqlite3
import datetime
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
+# The docs/guide should be the cannonical home for these mappings; update there
+# first
CROSSREF_TYPE_MAP = {
'book': 'book',
'book-chapter': 'chapter',
@@ -29,8 +31,14 @@ CROSSREF_TYPE_MAP = {
'standard': 'standard',
}
+class CrossrefImporter(FatcatImporter):
+ """
+ Importer for Crossref metadata.
-class FatcatCrossrefImporter(FatcatImporter):
+ Can use a local sqlite3 file for faster "external identifier" lookups
+
+ See https://github.com/CrossRef/rest-api-doc for JSON schema notes
+ """
def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
super().__init__(host_url, issn_map_file)
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index dedc9728..ba8a4e6f 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -5,12 +5,12 @@ import json
import base64
import datetime
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
MAX_ABSTRACT_BYTES=4096
-class FatcatGrobidMetadataImporter(FatcatImporter):
+class GrobidMetadataImporter(FatcatImporter):
def __init__(self, host_url, default_link_rel="web"):
super().__init__(host_url)
diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py
index ba8492c6..0b0efccb 100644
--- a/python/fatcat_tools/importers/issn.py
+++ b/python/fatcat_tools/importers/issn.py
@@ -3,10 +3,8 @@ import sys
import json
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
-# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
def or_none(s):
if s is None:
@@ -26,7 +24,15 @@ def truthy(s):
else:
return None
-class FatcatIssnImporter(FatcatImporter):
+class IssnImporter(FatcatImporter):
+ """
+ Imports journal metadata ("containers") by ISSN, currently from a custom
+ (data munged) .csv file format
+
+ CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
+
+ ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
+ """
def parse_issn_row(self, row):
"""
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 774019c7..732fccbe 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -4,7 +4,7 @@ import json
import sqlite3
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
#row = row.split('\t')
#assert len(row) == 2
@@ -13,8 +13,14 @@ from fatcat_tools.importers.common import FatcatImporter
#print(sha1)
#dois = [d.lower() for d in json.loads(row[1])]
-class FatcatMatchedImporter(FatcatImporter):
+class MatchedImporter(FatcatImporter):
"""
+ Importer for "file to crossref DOI" matches.
+
+ These matches are currently generated by Internet Archive hadoop jobs
+ written in scala (part of the 'sandcrawler' repo/project), but could be
+ generated by other parties as well.
+
Input format is JSON with keys:
- dois (list)
- sha1 (hex)
diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py
index 527316dd..9e4767f9 100644
--- a/python/fatcat_tools/importers/orcid.py
+++ b/python/fatcat_tools/importers/orcid.py
@@ -3,7 +3,7 @@ import sys
import json
import itertools
import fatcat_client
-from fatcat_tools.importers.common import FatcatImporter
+from .common import FatcatImporter
def value_or_none(e):
if type(e) == dict:
@@ -20,7 +20,7 @@ def value_or_none(e):
return None
return e
-class FatcatOrcidImporter(FatcatImporter):
+class OrcidImporter(FatcatImporter):
def parse_orcid_dict(self, obj):
"""
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index e10c6ba5..147acd7c 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -3,6 +3,9 @@ import collections
from fatcat_client import ReleaseEntity, ApiClient
def entity_to_json(entity):
+ """
+ Hack to take advantage of the code-generated serialization code
+ """
ac = ApiClient()
return ac.sanitize_for_serialization(entity)
@@ -15,11 +18,12 @@ def entity_from_json(json_str, entity_type):
thing.data = json_str
return ac.deserialize(thing, entity_type)
-def release_elastic_dict(release):
+def release_to_elasticsearch(release):
"""
Converts from an entity model/schema to elasticsearch oriented schema.
Returns: dict
+ Raises exception on error (never returns None)
"""
if release.state != 'active':
diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py
new file mode 100644
index 00000000..e8973bc3
--- /dev/null
+++ b/python/fatcat_tools/workers/__init__.py
@@ -0,0 +1,4 @@
+
+from .changelog import ChangelogWorker, EntityUpdatesWorker
+from .elasticsearch import ElasticsearchReleaseWorker
+from .worker_common import most_recent_message, FatcatWorker
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index d07b5988..e64c043b 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -1,11 +1,12 @@
import json
import time
-from fatcat_tools.workers.worker_common import FatcatWorker, most_recent_message
from pykafka.common import OffsetType
+from .worker_common import FatcatWorker, most_recent_message
-class FatcatChangelogWorker(FatcatWorker):
+
+class ChangelogWorker(FatcatWorker):
"""
Periodically polls the fatcat API looking for new changelogs. When they are
found, fetch them and push (as JSON) into a Kafka topic.
@@ -52,7 +53,7 @@ class FatcatChangelogWorker(FatcatWorker):
time.sleep(self.poll_interval)
-class FatcatEntityUpdatesWorker(FatcatWorker):
+class EntityUpdatesWorker(FatcatWorker):
"""
Consumes from the changelog topic and publishes expanded entities (fetched
from API) to update topics.
diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elasticsearch.py
index 3a75a1b3..e7abd5ee 100644
--- a/python/fatcat_tools/workers/elastic.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,14 +2,14 @@
import json
import time
import requests
-from fatcat_tools.transforms import release_elastic_dict
-from fatcat_tools.workers.worker_common import FatcatWorker
-from fatcat_client import ReleaseEntity
-from fatcat_tools.transforms import *
from pykafka.common import OffsetType
+from fatcat_client import ReleaseEntity
+from fatcat_tools import *
+from .worker_common import FatcatWorker
+
-class FatcatElasticReleaseWorker(FatcatWorker):
+class ElasticsearchReleaseWorker(FatcatWorker):
"""
Consumes from release-updates topic and pushes into (presumably local)
elasticsearch.
@@ -18,13 +18,13 @@ class FatcatElasticReleaseWorker(FatcatWorker):
"""
def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- elastic_backend="http://localhost:9200", elastic_index="fatcat"):
+ elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
super().__init__(kafka_hosts=kafka_hosts,
consume_topic=consume_topic,
api_host_url=None)
- self.consumer_group = "elastic-updates"
- self.elastic_backend = elastic_backend
- self.elastic_index = elastic_index
+ self.consumer_group = "elasticsearch-updates"
+ self.elasticsearch_backend = elasticsearch_backend
+ self.elasticsearch_index = elasticsearch_index
def run(self):
consume_topic = self.kafka.topics[self.consume_topic]
@@ -42,11 +42,11 @@ class FatcatElasticReleaseWorker(FatcatWorker):
json_str = msg.value.decode('utf-8')
release = entity_from_json(json_str, ReleaseEntity)
#print(release)
- elastic_endpoint = "{}/{}/release/{}".format(
- self.elastic_backend,
- self.elastic_index,
+ elasticsearch_endpoint = "{}/{}/release/{}".format(
+ self.elasticsearch_backend,
+ self.elasticsearch_index,
release.ident)
- print("Updating document: {}".format(elastic_endpoint))
- resp = requests.post(elastic_endpoint, json=release_elastic_dict(release))
+ print("Updating document: {}".format(elasticsearch_endpoint))
+ resp = requests.post(elasticsearch_endpoint, json=release_to_elasticsearch(release))
assert resp.status_code in (200, 201)
#consumer.commit_offsets()