diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/__init__.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/fcid.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/importers/__init__.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/importers/common.py | 3 | ||||
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 12 | ||||
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/issn.py | 14 | ||||
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 10 | ||||
-rw-r--r-- | python/fatcat_tools/importers/orcid.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/transforms.py | 6 | ||||
-rw-r--r-- | python/fatcat_tools/workers/__init__.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/workers/elasticsearch.py (renamed from python/fatcat_tools/workers/elastic.py) | 28 |
13 files changed, 78 insertions, 30 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py new file mode 100644 index 00000000..0bb42ab5 --- /dev/null +++ b/python/fatcat_tools/__init__.py @@ -0,0 +1,3 @@ + +from .fcid import fcid2uuid, uuid2fcid +from .transforms import entity_to_json, entity_from_json, release_to_elasticsearch diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py index dd72b242..4194ea63 100644 --- a/python/fatcat_tools/fcid.py +++ b/python/fatcat_tools/fcid.py @@ -3,12 +3,18 @@ import base64 import uuid def fcid2uuid(s): + """ + Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object + """ s = s.split('_')[-1].upper().encode('utf-8') assert len(s) == 26 raw = base64.b32decode(s + b"======") return str(uuid.UUID(bytes=raw)).lower() def uuid2fcid(s): + """ + Converts a uuid.UUID object to a fatcat identifier (base32 encoded string) + """ raw = uuid.UUID(s).bytes return base64.b32encode(raw)[:26].lower().decode('utf-8') diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py new file mode 100644 index 00000000..0f5fafb6 --- /dev/null +++ b/python/fatcat_tools/importers/__init__.py @@ -0,0 +1,7 @@ + +from .common import FatcatImporter +from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP +from .grobid_metadata import GrobidMetadataImporter +from .issn import IssnImporter +from .matched import MatchedImporter +from .orcid import OrcidImporter diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index d289171d..9cf92b41 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -16,6 +16,9 @@ def grouper(iterable, n, fillvalue=None): return itertools.zip_longest(*args, fillvalue=fillvalue) class FatcatImporter: + """ + Base class for fatcat importers + """ def __init__(self, host_url, issn_map_file=None): conf = fatcat_client.Configuration() diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fe80c2d3..fac8f32b 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -5,9 +5,11 @@ import sqlite3 import datetime import itertools import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter +# The docs/guide should be the cannonical home for these mappings; update there +# first CROSSREF_TYPE_MAP = { 'book': 'book', 'book-chapter': 'chapter', @@ -29,8 +31,14 @@ CROSSREF_TYPE_MAP = { 'standard': 'standard', } +class CrossrefImporter(FatcatImporter): + """ + Importer for Crossref metadata. -class FatcatCrossrefImporter(FatcatImporter): + Can use a local sqlite3 file for faster "external identifier" lookups + + See https://github.com/CrossRef/rest-api-doc for JSON schema notes + """ def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True): super().__init__(host_url, issn_map_file) diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index dedc9728..ba8a4e6f 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -5,12 +5,12 @@ import json import base64 import datetime import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter MAX_ABSTRACT_BYTES=4096 -class FatcatGrobidMetadataImporter(FatcatImporter): +class GrobidMetadataImporter(FatcatImporter): def __init__(self, host_url, default_link_rel="web"): super().__init__(host_url) diff --git a/python/fatcat_tools/importers/issn.py b/python/fatcat_tools/importers/issn.py index ba8492c6..0b0efccb 100644 --- a/python/fatcat_tools/importers/issn.py +++ b/python/fatcat_tools/importers/issn.py @@ -3,10 +3,8 @@ import sys import json import itertools import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter -# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): -# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count def or_none(s): if s is None: @@ -26,7 +24,15 @@ def truthy(s): else: return None -class FatcatIssnImporter(FatcatImporter): +class IssnImporter(FatcatImporter): + """ + Imports journal metadata ("containers") by ISSN, currently from a custom + (data munged) .csv file format + + CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): + + ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + """ def parse_issn_row(self, row): """ diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 774019c7..732fccbe 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -4,7 +4,7 @@ import json import sqlite3 import itertools import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter #row = row.split('\t') #assert len(row) == 2 @@ -13,8 +13,14 @@ from fatcat_tools.importers.common import FatcatImporter #print(sha1) #dois = [d.lower() for d in json.loads(row[1])] -class FatcatMatchedImporter(FatcatImporter): +class MatchedImporter(FatcatImporter): """ + Importer for "file to crossref DOI" matches. + + These matches are currently generated by Internet Archive hadoop jobs + written in scala (part of the 'sandcrawler' repo/project), but could be + generated by other parties as well. + Input format is JSON with keys: - dois (list) - sha1 (hex) diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index 527316dd..9e4767f9 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from fatcat_tools.importers.common import FatcatImporter +from .common import FatcatImporter def value_or_none(e): if type(e) == dict: @@ -20,7 +20,7 @@ def value_or_none(e): return None return e -class FatcatOrcidImporter(FatcatImporter): +class OrcidImporter(FatcatImporter): def parse_orcid_dict(self, obj): """ diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py index e10c6ba5..147acd7c 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms.py @@ -3,6 +3,9 @@ import collections from fatcat_client import ReleaseEntity, ApiClient def entity_to_json(entity): + """ + Hack to take advantage of the code-generated serialization code + """ ac = ApiClient() return ac.sanitize_for_serialization(entity) @@ -15,11 +18,12 @@ def entity_from_json(json_str, entity_type): thing.data = json_str return ac.deserialize(thing, entity_type) -def release_elastic_dict(release): +def release_to_elasticsearch(release): """ Converts from an entity model/schema to elasticsearch oriented schema. Returns: dict + Raises exception on error (never returns None) """ if release.state != 'active': diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py new file mode 100644 index 00000000..e8973bc3 --- /dev/null +++ b/python/fatcat_tools/workers/__init__.py @@ -0,0 +1,4 @@ + +from .changelog import ChangelogWorker, EntityUpdatesWorker +from .elasticsearch import ElasticsearchReleaseWorker +from .worker_common import most_recent_message, FatcatWorker diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d07b5988..e64c043b 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -1,11 +1,12 @@ import json import time -from fatcat_tools.workers.worker_common import FatcatWorker, most_recent_message from pykafka.common import OffsetType +from .worker_common import FatcatWorker, most_recent_message -class FatcatChangelogWorker(FatcatWorker): + +class ChangelogWorker(FatcatWorker): """ Periodically polls the fatcat API looking for new changelogs. When they are found, fetch them and push (as JSON) into a Kafka topic. @@ -52,7 +53,7 @@ class FatcatChangelogWorker(FatcatWorker): time.sleep(self.poll_interval) -class FatcatEntityUpdatesWorker(FatcatWorker): +class EntityUpdatesWorker(FatcatWorker): """ Consumes from the changelog topic and publishes expanded entities (fetched from API) to update topics. diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elasticsearch.py index 3a75a1b3..e7abd5ee 100644 --- a/python/fatcat_tools/workers/elastic.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -2,14 +2,14 @@ import json import time import requests -from fatcat_tools.transforms import release_elastic_dict -from fatcat_tools.workers.worker_common import FatcatWorker -from fatcat_client import ReleaseEntity -from fatcat_tools.transforms import * from pykafka.common import OffsetType +from fatcat_client import ReleaseEntity +from fatcat_tools import * +from .worker_common import FatcatWorker + -class FatcatElasticReleaseWorker(FatcatWorker): +class ElasticsearchReleaseWorker(FatcatWorker): """ Consumes from release-updates topic and pushes into (presumably local) elasticsearch. @@ -18,13 +18,13 @@ class FatcatElasticReleaseWorker(FatcatWorker): """ def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, - elastic_backend="http://localhost:9200", elastic_index="fatcat"): + elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"): super().__init__(kafka_hosts=kafka_hosts, consume_topic=consume_topic, api_host_url=None) - self.consumer_group = "elastic-updates" - self.elastic_backend = elastic_backend - self.elastic_index = elastic_index + self.consumer_group = "elasticsearch-updates" + self.elasticsearch_backend = elasticsearch_backend + self.elasticsearch_index = elasticsearch_index def run(self): consume_topic = self.kafka.topics[self.consume_topic] @@ -42,11 +42,11 @@ class FatcatElasticReleaseWorker(FatcatWorker): json_str = msg.value.decode('utf-8') release = entity_from_json(json_str, ReleaseEntity) #print(release) - elastic_endpoint = "{}/{}/release/{}".format( - self.elastic_backend, - self.elastic_index, + elasticsearch_endpoint = "{}/{}/release/{}".format( + self.elasticsearch_backend, + self.elasticsearch_index, release.ident) - print("Updating document: {}".format(elastic_endpoint)) - resp = requests.post(elastic_endpoint, json=release_elastic_dict(release)) + print("Updating document: {}".format(elasticsearch_endpoint)) + resp = requests.post(elasticsearch_endpoint, json=release_to_elasticsearch(release)) assert resp.status_code in (200, 201) #consumer.commit_offsets() |