From 7ebda2e051b51e49544ab75673b19ec5f27d9d45 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 12 Nov 2018 23:37:28 -0800 Subject: more python module refactoring --- python/fatcat_tools/changelog_workers.py | 2 +- python/fatcat_tools/crossref_importer.py | 2 +- python/fatcat_tools/elastic_workers.py | 4 +- python/fatcat_tools/entity_helpers.py | 100 ------------------------ python/fatcat_tools/grobid_metadata_importer.py | 2 +- python/fatcat_tools/issn_importer.py | 2 +- python/fatcat_tools/matched_importer.py | 2 +- python/fatcat_tools/orcid_importer.py | 2 +- python/fatcat_tools/transforms.py | 100 ++++++++++++++++++++++++ 9 files changed, 108 insertions(+), 108 deletions(-) delete mode 100644 python/fatcat_tools/entity_helpers.py create mode 100644 python/fatcat_tools/transforms.py (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/changelog_workers.py b/python/fatcat_tools/changelog_workers.py index e341ea32..223d4478 100644 --- a/python/fatcat_tools/changelog_workers.py +++ b/python/fatcat_tools/changelog_workers.py @@ -2,7 +2,7 @@ import json import time from itertools import islice -from fatcat.worker_common import FatcatWorker +from fatcat_tools.worker_common import FatcatWorker from pykafka.common import OffsetType diff --git a/python/fatcat_tools/crossref_importer.py b/python/fatcat_tools/crossref_importer.py index 37005965..6a5ad824 100644 --- a/python/fatcat_tools/crossref_importer.py +++ b/python/fatcat_tools/crossref_importer.py @@ -5,7 +5,7 @@ import sqlite3 import datetime import itertools import fatcat_client -from fatcat.importer_common import FatcatImporter +from fatcat_tools.importer_common import FatcatImporter class FatcatCrossrefImporter(FatcatImporter): diff --git a/python/fatcat_tools/elastic_workers.py b/python/fatcat_tools/elastic_workers.py index 3d2e9c39..eac8d6b0 100644 --- a/python/fatcat_tools/elastic_workers.py +++ b/python/fatcat_tools/elastic_workers.py @@ -2,9 +2,9 @@ import json import time import requests -from fatcat.worker_common import FatcatWorker +from fatcat_tools.worker_common import FatcatWorker from fatcat_client.models import ReleaseEntity -from fatcat.entity_helpers import * +from fatcat_tools.transforms import * from pykafka.common import OffsetType diff --git a/python/fatcat_tools/entity_helpers.py b/python/fatcat_tools/entity_helpers.py deleted file mode 100644 index c454536b..00000000 --- a/python/fatcat_tools/entity_helpers.py +++ /dev/null @@ -1,100 +0,0 @@ - -import collections -from fatcat_client.models import ReleaseEntity -from fatcat_client.api_client import ApiClient - -def entity_to_json(entity): - ac = ApiClient() - return ac.sanitize_for_serialization(entity) - -def entity_from_json(json_str, entity_type): - """ - Hack to take advantage of the code-generated deserialization code - """ - ac = ApiClient() - thing = collections.namedtuple('Thing', ['data']) - thing.data = json_str - return ac.deserialize(thing, entity_type) - -def release_elastic_dict(release): - """ - Converts from an entity model/schema to elasticsearch oriented schema. - - Returns: dict - """ - - if release.state != 'active': - raise ValueError("Entity is not 'active'") - - # First, the easy ones (direct copy) - t = dict( - ident = release.ident, - revision = release.revision, - title = release.title, - release_type = release.release_type, - release_status = release.release_status, - language = release.language, - doi = release.doi, - pmid = release.pmid, - pmcid = release.pmcid, - isbn13 = release.isbn13, - core_id = release.core_id, - wikidata_qid = release.wikidata_qid - ) - - if release.release_date: - # TODO: resolve why this can be either a string or datetime - if type(release.release_date) == str: - t['release_date'] = release.release_date - else: - t['release_date'] = release.release_date.strftime('%F') - - container = release.container - container_is_kept = False - if container: - t['publisher'] = container.publisher - t['container_name'] = container.name - t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') - else: - t['publisher'] = release.publisher - - files = release.files or [] - t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None - for f in files: - is_pdf = 'pdf' in f.get('mimetype', '') - for url in f.get('urls', []): - if url.get('rel', '') == 'webarchive': - in_wa = True - if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: - in_ia = True - if is_pdf: - t['file_pdf_url'] = url['url'] - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url['url'] - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia - - extra = release.extra or dict() - if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) - - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names - return t diff --git a/python/fatcat_tools/grobid_metadata_importer.py b/python/fatcat_tools/grobid_metadata_importer.py index 95cc285e..effa0d94 100755 --- a/python/fatcat_tools/grobid_metadata_importer.py +++ b/python/fatcat_tools/grobid_metadata_importer.py @@ -5,7 +5,7 @@ import json import base64 import datetime import fatcat_client -from fatcat.importer_common import FatcatImporter +from fatcat_tools.importer_common import FatcatImporter MAX_ABSTRACT_BYTES=4096 diff --git a/python/fatcat_tools/issn_importer.py b/python/fatcat_tools/issn_importer.py index c9ef50b5..e3ed7382 100644 --- a/python/fatcat_tools/issn_importer.py +++ b/python/fatcat_tools/issn_importer.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from fatcat.importer_common import FatcatImporter +from fatcat_tools.importer_common import FatcatImporter # CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): # ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count diff --git a/python/fatcat_tools/matched_importer.py b/python/fatcat_tools/matched_importer.py index 7f55369b..627ab6f1 100644 --- a/python/fatcat_tools/matched_importer.py +++ b/python/fatcat_tools/matched_importer.py @@ -4,7 +4,7 @@ import json import sqlite3 import itertools import fatcat_client -from fatcat.importer_common import FatcatImporter +from fatcat_tools.importer_common import FatcatImporter #row = row.split('\t') #assert len(row) == 2 diff --git a/python/fatcat_tools/orcid_importer.py b/python/fatcat_tools/orcid_importer.py index e1f5943c..f2366c66 100644 --- a/python/fatcat_tools/orcid_importer.py +++ b/python/fatcat_tools/orcid_importer.py @@ -3,7 +3,7 @@ import sys import json import itertools import fatcat_client -from fatcat.importer_common import FatcatImporter +from fatcat_tools.importer_common import FatcatImporter def value_or_none(e): if type(e) == dict: diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py new file mode 100644 index 00000000..c454536b --- /dev/null +++ b/python/fatcat_tools/transforms.py @@ -0,0 +1,100 @@ + +import collections +from fatcat_client.models import ReleaseEntity +from fatcat_client.api_client import ApiClient + +def entity_to_json(entity): + ac = ApiClient() + return ac.sanitize_for_serialization(entity) + +def entity_from_json(json_str, entity_type): + """ + Hack to take advantage of the code-generated deserialization code + """ + ac = ApiClient() + thing = collections.namedtuple('Thing', ['data']) + thing.data = json_str + return ac.deserialize(thing, entity_type) + +def release_elastic_dict(release): + """ + Converts from an entity model/schema to elasticsearch oriented schema. + + Returns: dict + """ + + if release.state != 'active': + raise ValueError("Entity is not 'active'") + + # First, the easy ones (direct copy) + t = dict( + ident = release.ident, + revision = release.revision, + title = release.title, + release_type = release.release_type, + release_status = release.release_status, + language = release.language, + doi = release.doi, + pmid = release.pmid, + pmcid = release.pmcid, + isbn13 = release.isbn13, + core_id = release.core_id, + wikidata_qid = release.wikidata_qid + ) + + if release.release_date: + # TODO: resolve why this can be either a string or datetime + if type(release.release_date) == str: + t['release_date'] = release.release_date + else: + t['release_date'] = release.release_date.strftime('%F') + + container = release.container + container_is_kept = False + if container: + t['publisher'] = container.publisher + t['container_name'] = container.name + t['container_issnl'] = container.issnl + container_extra = container.extra + if container_extra: + t['container_is_oa'] = container_extra.get('is_oa') + container_is_kept = container_extra.get('is_kept', False) + t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + else: + t['publisher'] = release.publisher + + files = release.files or [] + t['file_count'] = len(files) + in_wa = False + in_ia = False + t['file_pdf_url'] = None + for f in files: + is_pdf = 'pdf' in f.get('mimetype', '') + for url in f.get('urls', []): + if url.get('rel', '') == 'webarchive': + in_wa = True + if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: + in_ia = True + if is_pdf: + t['file_pdf_url'] = url['url'] + if not t['file_pdf_url'] and is_pdf: + t['file_pdf_url'] = url['url'] + t['file_in_webarchive'] = in_wa + t['file_in_ia'] = in_ia + + extra = release.extra or dict() + if extra: + t['in_shadow'] = extra.get('in_shadow') + if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): + t['container_is_longtail_oa'] = True + t['any_abstract'] = bool(release.abstracts) + t['is_kept'] = container_is_kept or extra.get('is_kept', False) + + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + return t -- cgit v1.2.3