From 655f7060eb5b5e711a8a892cb1085639c4aa8fd2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 11 Mar 2019 16:38:51 -0700
Subject: refactor transforms into sub-dir

---
 python/fatcat_tools/__init__.py                 |   4 +-
 python/fatcat_tools/transforms.py               | 517 ------------------------
 python/fatcat_tools/transforms/__init__.py      |   4 +
 python/fatcat_tools/transforms/csl.py           | 170 ++++++++
 python/fatcat_tools/transforms/elasticsearch.py | 327 +++++++++++++++
 python/fatcat_tools/transforms/entities.py      |  31 ++
 6 files changed, 533 insertions(+), 520 deletions(-)
 delete mode 100644 python/fatcat_tools/transforms.py
 create mode 100644 python/fatcat_tools/transforms/__init__.py
 create mode 100644 python/fatcat_tools/transforms/csl.py
 create mode 100644 python/fatcat_tools/transforms/elasticsearch.py
 create mode 100644 python/fatcat_tools/transforms/entities.py

diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index c72ccd47..f2798f0b 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,6 +1,4 @@
 
 from .api_auth import authenticated_api, public_api
 from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_dict, entity_from_json, \
-    release_to_elasticsearch, container_to_elasticsearch, \
-    changelog_to_elasticsearch, release_to_csl
+from .transforms import *
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
deleted file mode 100644
index f49b5ac9..00000000
--- a/python/fatcat_tools/transforms.py
+++ /dev/null
@@ -1,517 +0,0 @@
-
-
-import collections
-from fatcat_client import ApiClient
-
-def entity_to_dict(entity, api_client=None):
-    """
-    Hack to take advantage of the code-generated serialization code.
-
-    Initializing/destroying ApiClient objects is surprisingly expensive
-    (because it involves a threadpool), so we allow passing an existing
-    instance. If you already have a full-on API connection `api`, you can
-    access the ApiClient object as `api.api_client`. This is such a speed-up
-    that this argument may become mandatory.
-    """
-    if not api_client:
-        api_client = ApiClient()
-    return api_client.sanitize_for_serialization(entity)
-
-def entity_from_json(json_str, entity_type, api_client=None):
-    """
-    Hack to take advantage of the code-generated deserialization code.
-
-    See note on `entity_to_dict()` about the api_client argument.
-    """
-    if not api_client:
-        api_client = ApiClient()
-    thing = collections.namedtuple('Thing', ['data'])
-    thing.data = json_str
-    return api_client.deserialize(thing, entity_type)
-
-def check_kbart(year, archive):
-    if not archive or not archive.get('year_spans'):
-        return None
-    for span in archive['year_spans']:
-        if year >= span[0] and year <= span[1]:
-            return True
-    return False
-
-def test_check_kbart():
-
-    assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) == False
-    assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) == True
-    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
-    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
-
-def release_to_elasticsearch(entity, force_bool=True):
-    """
-    Converts from an entity model/schema to an elasticsearch-oriented schema.
-
-    Returns: dict
-    Raises exception on error (never returns None)
-    """
-
-    if entity.state in ('redirect', 'deleted'):
-        return dict(
-            ident = entity.ident,
-            state = entity.state,
-        )
-    elif entity.state != 'active':
-        raise ValueError("Unhandled entity state: {}".format(entity.state))
-
-    # First, the easy ones (direct copy)
-    release = entity
-    t = dict(
-        ident = release.ident,
-        state = release.state,
-        revision = release.revision,
-        title = release.title,
-        original_title = release.original_title,
-        release_type = release.release_type,
-        release_status = release.release_status,
-        language = release.language,
-        license = release.license_slug,
-        doi = release.doi,
-        pmid = release.pmid,
-        pmcid = release.pmcid,
-        isbn13 = release.isbn13,
-        wikidata_qid = release.wikidata_qid,
-        core_id = release.core_id,
-        arxiv_id = release.arxiv_id,
-        jstor_id = release.jstor_id,
-    )
-
-    is_oa = None
-    is_preserved = None
-    is_longtail_oa = None
-    in_kbart = None
-    in_jstor = False
-    in_web = False
-    in_dweb = False
-    in_ia = False
-    in_ia_sim = False
-    in_shadow = False
-
-    release_year = release.release_year
-    if release.release_date:
-        # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
-        t['release_date'] = release.release_date.isoformat()
-        if not release_year:
-            release_year = release.release_date.year
-    if release_year:
-        t['release_year'] = release_year
-
-    t['any_abstract'] = len(release.abstracts) > 0
-    t['ref_count'] = len(release.refs or [])
-    t['contrib_count'] = len(release.contribs or [])
-    contrib_names = []
-    for c in (release.contribs or []):
-        if c.raw_name:
-            contrib_names.append(c.raw_name)
-    t['contrib_names'] = contrib_names
-
-    container = release.container
-    if container:
-        t['publisher'] = container.publisher
-        t['container_name'] = container.name
-        t['container_id'] = container.ident
-        t['container_issnl'] = container.issnl
-        t['container_type'] = container.container_type
-        if container.extra:
-            c_extra = container.extra
-            if c_extra.get('kbart') and release_year:
-                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
-                in_kbart = in_jstor
-                for archive in ('portico', 'lockss', 'clockss'):
-                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
-
-            if c_extra.get('ia'):
-                if c_extra['ia'].get('sim') and release_year:
-                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
-                if c_extra['ia'].get('longtail_oa'):
-                    is_longtail_oa = True
-            if c_extra.get('sherpa_romeo'):
-                if c_extra['sherpa_romeo'].get('color') == 'white':
-                    is_oa = False
-            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
-                is_oa = True
-            if c_extra.get('doaj'):
-                if c_extra['doaj'].get('as_of'):
-                    is_oa = True
-            if c_extra.get('road'):
-                if c_extra['road'].get('as_of'):
-                    is_oa = True
-    else:
-        t['publisher'] = release.publisher
-
-    if release.jstor_id or (release.doi and release.doi.startswith('10.2307/')):
-        in_jstor = True
-
-    files = release.files or []
-    t['file_count'] = len(files)
-    t['fileset_count'] = len(release.filesets or [])
-    t['webcapture_count'] = len(release.webcaptures or [])
-    any_pdf_url = None
-    good_pdf_url = None
-    best_pdf_url = None
-    ia_pdf_url = None
-    for f in files:
-        if f.extra and f.extra.get('shadows'):
-            # TODO: shadow check goes here
-            in_shadow = True
-        is_pdf = 'pdf' in (f.mimetype or '')
-        for url in (f.urls or []):
-            if url.url.lower().startswith('http'):
-                in_web = True
-            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
-                # not sure what rel will be for this stuff
-                in_dweb = True
-            if is_pdf:
-                any_pdf_url = url.url
-            if is_pdf and url.rel in ('webarchive', 'repository'):
-                is_preserved = True
-                good_pdf_url = url.url
-            if '//www.jstor.org/' in url.url:
-                in_jstor = True
-            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
-                in_ia = True
-                if is_pdf:
-                    best_pdf_url = url.url
-                    ia_pdf_url = url.url
-    # here is where we bake-in priority; IA-specific
-    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
-    t['ia_pdf_url'] = ia_pdf_url
-
-    if release.license_slug:
-        # TODO: more/better checks here, particularly strict *not* OA licenses
-        if release.license_slug.startswith("CC-"):
-            is_oa = True
-
-    extra = release.extra or dict()
-    if extra:
-        if extra.get('is_oa'):
-            # NOTE: not actually setting this anywhere... but could
-            is_oa = True
-        if extra.get('longtail_oa'):
-            # sometimes set by GROBID/matcher
-            is_oa = True
-            is_longtail_oa = True
-        if not t.get('container_name'):
-            t['container_name'] = extra.get('container_name')
-        if extra.get('crossref'):
-            if extra['crossref'].get('archive'):
-                # all crossref archives are KBART, I believe
-                in_kbart = True
-
-    if is_longtail_oa:
-        is_oa = True
-
-    if force_bool:
-        t['is_oa'] = bool(is_oa)
-        t['is_longtail_oa'] = bool(is_longtail_oa)
-        t['in_kbart'] = bool(in_kbart)
-        t['in_ia_sim'] = bool(in_ia_sim)
-        t['in_jstor'] = bool(in_jstor)
-        t['in_web'] = bool(in_web)
-        t['in_dweb'] = bool(in_dweb)
-    else:
-        t['is_oa'] = is_oa
-        t['is_longtail_oa'] = is_longtail_oa
-        t['in_kbart'] = in_kbart
-        t['in_ia_sim'] = in_ia_sim
-        t['in_jstor'] = in_jstor
-        t['in_web'] = in_web
-        t['in_dweb'] = in_dweb
-
-    t['in_ia'] = bool(in_ia)
-    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
-    return t
-
-def container_to_elasticsearch(entity, force_bool=True):
-    """
-    Converts from an entity model/schema to an elasticsearch-oriented schema.
-
-    Returns: dict
-    Raises exception on error (never returns None)
-    """
-
-    if entity.state in ('redirect', 'deleted'):
-        return dict(
-            ident = entity.ident,
-            state = entity.state,
-        )
-    elif entity.state != 'active':
-        raise ValueError("Unhandled entity state: {}".format(entity.state))
-
-    # First, the easy ones (direct copy)
-    t = dict(
-        ident = entity.ident,
-        state = entity.state,
-        revision = entity.revision,
-
-        name = entity.name,
-        publisher = entity.publisher,
-        container_type = entity.container_type,
-        issnl = entity.issnl,
-        wikidata_qid = entity.wikidata_qid,
-    )
-
-    # TODO: region, discipline
-    # TODO: single primary language?
-    if not entity.extra:
-        entity.extra = dict()
-    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
-        if entity.extra.get(key):
-            t[key] = entity.extra[key]
-
-    in_doaj = None
-    in_road = None
-    # TODO: not currently implemented
-    in_doi = None
-    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
-    #in_doaj_works = None
-    in_sherpa_romeo = None
-    is_oa = None
-    # TODO: not actually set/stored anywhere?
-    is_longtail_oa = None
-    any_kbart = None
-    any_jstor = None
-    any_ia_sim = None
-
-    extra = entity.extra
-    if extra.get('doaj'):
-        if extra['doaj'].get('as_of'):
-            in_doaj = True
-    if extra.get('road'):
-        if extra['road'].get('as_of'):
-            in_road = True
-    if extra.get('default_license'):
-        if extra['default_license'].startswith('CC-'):
-            is_oa = True
-    if extra.get('sherpa_romeo'):
-        in_sherpa_romeo = True
-        if extra['sherpa_romeo'].get('color') == 'white':
-            is_oa = False
-    if extra.get('kbart'):
-        any_kbart = True
-        if extra['kbart'].get('jstor'):
-            any_jstor = True
-    if extra.get('ia'):
-        if extra['ia'].get('sim'):
-            any_ia_sim = True
-
-    t['in_doaj'] = bool(in_doaj)
-    t['in_road'] = bool(in_road)
-    t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
-    t['any_kbart'] = bool(any_kbart)
-    t['is_longtail_oa'] = bool(is_longtail_oa)
-    if force_bool:
-        t['in_doi'] = bool(in_doi)
-        t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
-        t['any_jstor'] = bool(any_jstor)
-        t['any_ia_sim'] = bool(any_ia_sim)
-    else:
-        t['in_doi'] = in_doi
-        t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
-        t['any_jstor'] = any_jstor
-        t['any_ia_sim'] = any_ia_sim
-    return t
-
-
-def contribs_by_role(contribs, role):
-    ret = [c.copy() for c in contribs if c['role'] == role]
-    [c.pop('role') for c in ret]
-    # XXX:
-    [c.pop('literal') for c in ret]
-    if not ret:
-        return None
-    else:
-        return ret
-
-
-def release_to_csl(entity):
-    """
-    Returns a python dict which can be passed to json.dumps() to get CSL-JSON
-    (aka citeproc-JSON, aka Citation Style Language JSON).
-
-    This function will likely become an API method/endpoint.
-
-    Follows, but is not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
-    """
-    contribs = []
-    for contrib in (entity.contribs or []):
-        if contrib.creator:
-            # TODO: should we actually be pulling creator metadata? or just
-            # using release-local raw metadata?
-            c = dict(
-                family=contrib.creator.surname,
-                given=contrib.creator.given_name,
-                #dropping-particle
-                #non-dropping-particle
-                #suffix
-                #comma-suffix
-                #static-ordering
-                literal=contrib.raw_name, # or display_name?
-                #parse-names,
-                role=contrib.role,
-            )
-        else:
-            c = dict(
-                # XXX: possible inclusion of full name metadata in release_contrib
-                family=contrib.raw_name.split()[-1],
-                literal=contrib.raw_name,
-                role=contrib.role,
-            )
-        for k in list(c.keys()):
-            if not c[k]:
-                c.pop(k)
-        contribs.append(c)
-    abstract = None
-    if entity.abstracts:
-        abstract = entity.abstracts[0].content
-
-    issued_date = None
-    if entity.release_date:
-        issued_date = {"date-parts": [[
-            entity.release_date.year,
-            entity.release_date.month,
-            entity.release_date.day,
-        ]]}
-    elif entity.release_year:
-        issued_date = {"date-parts": [[entity.release_year]]}
-
-    csl = dict(
-        #id,
-        #categories
-        type=entity.release_type or "article", # XXX: can't be blank
-        language=entity.language,
-        #journalAbbreviation
-        #shortTitle
-        ## see below for all contrib roles
-        #accessed
-        #container
-        #event-date
-        issued=issued_date,
-        #original-date
-        #submitted
-        abstract=abstract,
-        #annote
-        #archive
-        #archive_location
-        #archive-place
-        #authority
-        #call-number
-        #chapter-number
-        #citation-number
-        #citation-label
-        #collection-number
-        #collection-title
-        container_title=entity.container and entity.container.name,
-        #container-title-short
-        #dimensions
-        DOI=entity.doi,
-        #edition
-        #event
-        #event-place
-        #first-reference-note-number
-        #genre
-        ISBN=entity.isbn13,
-        ISSN=entity.container and entity.container.issnl,
-        issue=entity.issue,
-        #jurisdiction
-        #keyword
-        #locator
-        #medium
-        #note
-        #number
-        #number-of-pages
-        #number-of-volumes
-        #original-publisher
-        #original-publisher-place
-        #original-title
-        # XXX: page=entity.pages,
-        page_first=entity.pages.split('-')[0],
-        PMCID=entity.pmcid,
-        PMID=entity.pmid,
-        publisher=(entity.container and entity.container.publisher) or entity.publisher,
-        #publisher-place
-        #references
-        #reviewed-title
-        #scale
-        #section
-        #source
-        #status
-        title=entity.title,
-        #title-short
-        #URL
-        #version
-        volume=entity.volume,
-        #year-suffix
-    )
-    for role in ['author', 'collection-editor', 'composer', 'container-author',
-            'director', 'editor', 'editorial-director', 'interviewer',
-            'illustrator', 'original-author', 'recipient', 'reviewed-author',
-            'translator']:
-        cbr = contribs_by_role(contribs, role)
-        if cbr:
-            csl[role] = cbr
-    # underline-to-dash
-    csl['container-title'] = csl.pop('container_title')
-    csl['page-first'] = csl.pop('page_first')
-    empty_keys = [k for k,v in csl.items() if not v]
-    for k in empty_keys:
-        csl.pop(k)
-    return csl
-
-
-def refs_to_csl(entity):
-    ret = []
-    for ref in entity.refs:
-        if ref.release_id and False:
-            # TODO: fetch full entity from API and convert with release_to_csl
-            raise NotImplementedError
-        else:
-            issued_date = None
-            if ref.year:
-                issued_date = [[ref.year]]
-            csl = dict(
-                title=ref.title,
-                issued=issued_date,
-            )
-            csl['id'] = ref.key or ref.index  # zero- or one-indexed?
-            ret.append(csl)
-    return ret
-
-
-def changelog_to_elasticsearch(entity):
-
-    editgroup = entity.editgroup
-    t = dict(
-        index=entity.index,
-        editgroup_id=entity.editgroup_id,
-        timestamp=entity.timestamp,
-        editor_id=editgroup.editor_id,
-    )
-
-    extra = editgroup.extra or dict()
-    if extra.get('agent'):
-        t['agent'] = extra['agent']
-
-    t['containers'] = len(editgroup.edits.containers)
-    t['creators'] = len(editgroup.edits.creators)
-    t['files'] = len(editgroup.edits.files)
-    t['filesets'] = len(editgroup.edits.filesets)
-    t['webcaptures'] = len(editgroup.edits.webcaptures)
-    t['releases'] = len(editgroup.edits.releases)
-    t['works'] = len(editgroup.edits.works)
-
-    # TODO: parse and pull out counts
-    #created = 0
-    #updated = 0
-    #deleted = 0
-    #t['created'] = created
-    #t['updated'] = updated
-    #t['deleted'] = deleted
-    #t['total'] = created + updated + deleted
-    return t
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
new file mode 100644
index 00000000..4950433b
--- /dev/null
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -0,0 +1,4 @@
+
+from .entities import entity_to_dict, entity_from_json
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .csl import release_to_csl
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
new file mode 100644
index 00000000..f9615b26
--- /dev/null
+++ b/python/fatcat_tools/transforms/csl.py
@@ -0,0 +1,170 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+
+def contribs_by_role(contribs, role):
+    ret = [c.copy() for c in contribs if c['role'] == role]
+    [c.pop('role') for c in ret]
+    # XXX:
+    [c.pop('literal') for c in ret]
+    if not ret:
+        return None
+    else:
+        return ret
+
+
+def release_to_csl(entity):
+    """
+    Returns a python dict which can be passed to json.dumps() to get CSL-JSON
+    (aka citeproc-JSON, aka Citation Style Language JSON).
+
+    This function will likely become an API method/endpoint.
+
+    Follows, but is not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
+    """
+    contribs = []
+    for contrib in (entity.contribs or []):
+        if contrib.creator:
+            # TODO: should we actually be pulling creator metadata? or just
+            # using release-local raw metadata?
+            c = dict(
+                family=contrib.creator.surname,
+                given=contrib.creator.given_name,
+                #dropping-particle
+                #non-dropping-particle
+                #suffix
+                #comma-suffix
+                #static-ordering
+                literal=contrib.raw_name, # or display_name?
+                #parse-names,
+                role=contrib.role,
+            )
+        else:
+            c = dict(
+                # XXX: possible inclusion of full name metadata in release_contrib
+                family=contrib.raw_name.split()[-1],
+                literal=contrib.raw_name,
+                role=contrib.role,
+            )
+        for k in list(c.keys()):
+            if not c[k]:
+                c.pop(k)
+        contribs.append(c)
+    abstract = None
+    if entity.abstracts:
+        abstract = entity.abstracts[0].content
+
+    issued_date = None
+    if entity.release_date:
+        issued_date = {"date-parts": [[
+            entity.release_date.year,
+            entity.release_date.month,
+            entity.release_date.day,
+        ]]}
+    elif entity.release_year:
+        issued_date = {"date-parts": [[entity.release_year]]}
+
+    csl = dict(
+        #id,
+        #categories
+        type=entity.release_type or "article", # XXX: can't be blank
+        language=entity.language,
+        #journalAbbreviation
+        #shortTitle
+        ## see below for all contrib roles
+        #accessed
+        #container
+        #event-date
+        issued=issued_date,
+        #original-date
+        #submitted
+        abstract=abstract,
+        #annote
+        #archive
+        #archive_location
+        #archive-place
+        #authority
+        #call-number
+        #chapter-number
+        #citation-number
+        #citation-label
+        #collection-number
+        #collection-title
+        container_title=entity.container and entity.container.name,
+        #container-title-short
+        #dimensions
+        DOI=entity.doi,
+        #edition
+        #event
+        #event-place
+        #first-reference-note-number
+        #genre
+        ISBN=entity.isbn13,
+        ISSN=entity.container and entity.container.issnl,
+        issue=entity.issue,
+        #jurisdiction
+        #keyword
+        #locator
+        #medium
+        #note
+        #number
+        #number-of-pages
+        #number-of-volumes
+        #original-publisher
+        #original-publisher-place
+        #original-title
+        # XXX: page=entity.pages,
+        page_first=entity.pages.split('-')[0],
+        PMCID=entity.pmcid,
+        PMID=entity.pmid,
+        publisher=(entity.container and entity.container.publisher) or entity.publisher,
+        #publisher-place
+        #references
+        #reviewed-title
+        #scale
+        #section
+        #source
+        #status
+        title=entity.title,
+        #title-short
+        #URL
+        #version
+        volume=entity.volume,
+        #year-suffix
+    )
+    for role in ['author', 'collection-editor', 'composer', 'container-author',
+            'director', 'editor', 'editorial-director', 'interviewer',
+            'illustrator', 'original-author', 'recipient', 'reviewed-author',
+            'translator']:
+        cbr = contribs_by_role(contribs, role)
+        if cbr:
+            csl[role] = cbr
+    # underline-to-dash
+    csl['container-title'] = csl.pop('container_title')
+    csl['page-first'] = csl.pop('page_first')
+    empty_keys = [k for k,v in csl.items() if not v]
+    for k in empty_keys:
+        csl.pop(k)
+    return csl
+
+
+def refs_to_csl(entity):
+    ret = []
+    for ref in entity.refs:
+        if ref.release_id and False:
+            # TODO: fetch full entity from API and convert with release_to_csl
+            raise NotImplementedError
+        else:
+            issued_date = None
+            if ref.year:
+                issued_date = [[ref.year]]
+            csl = dict(
+                title=ref.title,
+                issued=issued_date,
+            )
+            csl['id'] = ref.key or ref.index  # zero- or one-indexed?
+            ret.append(csl)
+    return ret
+
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
new file mode 100644
index 00000000..0c2c5e46
--- /dev/null
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -0,0 +1,327 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+
+def check_kbart(year, archive):
+    if not archive or not archive.get('year_spans'):
+        return None
+    for span in archive['year_spans']:
+        if year >= span[0] and year <= span[1]:
+            return True
+    return False
+
+def test_check_kbart():
+
+    assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) == False
+    assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) == True
+    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
+    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
+
+def release_to_elasticsearch(entity, force_bool=True):
+    """
+    Converts from an entity model/schema to an elasticsearch-oriented schema.
+
+    Returns: dict
+    Raises exception on error (never returns None)
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    release = entity
+    t = dict(
+        ident = release.ident,
+        state = release.state,
+        revision = release.revision,
+        title = release.title,
+        original_title = release.original_title,
+        release_type = release.release_type,
+        release_status = release.release_status,
+        language = release.language,
+        license = release.license_slug,
+        doi = release.doi,
+        pmid = release.pmid,
+        pmcid = release.pmcid,
+        isbn13 = release.isbn13,
+        wikidata_qid = release.wikidata_qid,
+        core_id = release.core_id,
+        arxiv_id = release.arxiv_id,
+        jstor_id = release.jstor_id,
+    )
+
+    is_oa = None
+    is_preserved = None
+    is_longtail_oa = None
+    in_kbart = None
+    in_jstor = False
+    in_web = False
+    in_dweb = False
+    in_ia = False
+    in_ia_sim = False
+    in_shadow = False
+
+    release_year = release.release_year
+    if release.release_date:
+        # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
+        t['release_date'] = release.release_date.isoformat()
+        if not release_year:
+            release_year = release.release_date.year
+    if release_year:
+        t['release_year'] = release_year
+
+    t['any_abstract'] = len(release.abstracts) > 0
+    t['ref_count'] = len(release.refs or [])
+    t['contrib_count'] = len(release.contribs or [])
+    contrib_names = []
+    for c in (release.contribs or []):
+        if c.raw_name:
+            contrib_names.append(c.raw_name)
+    t['contrib_names'] = contrib_names
+
+    container = release.container
+    if container:
+        t['publisher'] = container.publisher
+        t['container_name'] = container.name
+        t['container_id'] = container.ident
+        t['container_issnl'] = container.issnl
+        t['container_type'] = container.container_type
+        if container.extra:
+            c_extra = container.extra
+            if c_extra.get('kbart') and release_year:
+                in_jstor = check_kbart(release_year, c_extra['kbart'].get('jstor'))
+                in_kbart = in_jstor
+                for archive in ('portico', 'lockss', 'clockss'):
+                    in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
+
+            if c_extra.get('ia'):
+                if c_extra['ia'].get('sim') and release_year:
+                    in_ia_sim = check_kbart(release_year, c_extra['ia']['sim'])
+                if c_extra['ia'].get('longtail_oa'):
+                    is_longtail_oa = True
+            if c_extra.get('sherpa_romeo'):
+                if c_extra['sherpa_romeo'].get('color') == 'white':
+                    is_oa = False
+            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
+                is_oa = True
+            if c_extra.get('doaj'):
+                if c_extra['doaj'].get('as_of'):
+                    is_oa = True
+            if c_extra.get('road'):
+                if c_extra['road'].get('as_of'):
+                    is_oa = True
+    else:
+        t['publisher'] = release.publisher
+
+    if release.jstor_id or (release.doi and release.doi.startswith('10.2307/')):
+        in_jstor = True
+
+    files = release.files or []
+    t['file_count'] = len(files)
+    t['fileset_count'] = len(release.filesets or [])
+    t['webcapture_count'] = len(release.webcaptures or [])
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
+    for f in files:
+        if f.extra and f.extra.get('shadows'):
+            # TODO: shadow check goes here
+            in_shadow = True
+        is_pdf = 'pdf' in (f.mimetype or '')
+        for url in (f.urls or []):
+            if url.url.lower().startswith('http'):
+                in_web = True
+            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+                # not sure what rel will be for this stuff
+                in_dweb = True
+            if is_pdf:
+                any_pdf_url = url.url
+            if is_pdf and url.rel in ('webarchive', 'repository'):
+                is_preserved = True
+                good_pdf_url = url.url
+            if '//www.jstor.org/' in url.url:
+                in_jstor = True
+            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
+                in_ia = True
+                if is_pdf:
+                    best_pdf_url = url.url
+                    ia_pdf_url = url.url
+    # here is where we bake-in priority; IA-specific
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    if release.license_slug:
+        # TODO: more/better checks here, particularly strict *not* OA licenses
+        if release.license_slug.startswith("CC-"):
+            is_oa = True
+
+    extra = release.extra or dict()
+    if extra:
+        if extra.get('is_oa'):
+            # NOTE: not actually setting this anywhere... but could
+            is_oa = True
+        if extra.get('longtail_oa'):
+            # sometimes set by GROBID/matcher
+            is_oa = True
+            is_longtail_oa = True
+        if not t.get('container_name'):
+            t['container_name'] = extra.get('container_name')
+        if extra.get('crossref'):
+            if extra['crossref'].get('archive'):
+                # all crossref archives are KBART, I believe
+                in_kbart = True
+
+    if is_longtail_oa:
+        is_oa = True
+
+    if force_bool:
+        t['is_oa'] = bool(is_oa)
+        t['is_longtail_oa'] = bool(is_longtail_oa)
+        t['in_kbart'] = bool(in_kbart)
+        t['in_ia_sim'] = bool(in_ia_sim)
+        t['in_jstor'] = bool(in_jstor)
+        t['in_web'] = bool(in_web)
+        t['in_dweb'] = bool(in_dweb)
+    else:
+        t['is_oa'] = is_oa
+        t['is_longtail_oa'] = is_longtail_oa
+        t['in_kbart'] = in_kbart
+        t['in_ia_sim'] = in_ia_sim
+        t['in_jstor'] = in_jstor
+        t['in_web'] = in_web
+        t['in_dweb'] = in_dweb
+
+    t['in_ia'] = bool(in_ia)
+    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+    return t
+
+def container_to_elasticsearch(entity, force_bool=True):
+    """
+    Converts from an entity model/schema to an elasticsearch-oriented schema.
+
+    Returns: dict
+    Raises exception on error (never returns None)
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = entity.ident,
+        state = entity.state,
+        revision = entity.revision,
+
+        name = entity.name,
+        publisher = entity.publisher,
+        container_type = entity.container_type,
+        issnl = entity.issnl,
+        wikidata_qid = entity.wikidata_qid,
+    )
+
+    # TODO: region, discipline
+    # TODO: single primary language?
+    if not entity.extra:
+        entity.extra = dict()
+    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+        if entity.extra.get(key):
+            t[key] = entity.extra[key]
+
+    in_doaj = None
+    in_road = None
+    # TODO: not currently implemented
+    in_doi = None
+    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
+    #in_doaj_works = None
+    in_sherpa_romeo = None
+    is_oa = None
+    # TODO: not actually set/stored anywhere?
+    is_longtail_oa = None
+    any_kbart = None
+    any_jstor = None
+    any_ia_sim = None
+
+    extra = entity.extra
+    if extra.get('doaj'):
+        if extra['doaj'].get('as_of'):
+            in_doaj = True
+    if extra.get('road'):
+        if extra['road'].get('as_of'):
+            in_road = True
+    if extra.get('default_license'):
+        if extra['default_license'].startswith('CC-'):
+            is_oa = True
+    if extra.get('sherpa_romeo'):
+        in_sherpa_romeo = True
+        if extra['sherpa_romeo'].get('color') == 'white':
+            is_oa = False
+    if extra.get('kbart'):
+        any_kbart = True
+        if extra['kbart'].get('jstor'):
+            any_jstor = True
+    if extra.get('ia'):
+        if extra['ia'].get('sim'):
+            any_ia_sim = True
+
+    t['in_doaj'] = bool(in_doaj)
+    t['in_road'] = bool(in_road)
+    t['in_sherpa_romeo'] = bool(in_sherpa_romeo)
+    t['any_kbart'] = bool(any_kbart)
+    t['is_longtail_oa'] = bool(is_longtail_oa)
+    if force_bool:
+        t['in_doi'] = bool(in_doi)
+        t['is_oa'] = bool(in_doaj or in_road or is_longtail_oa or is_oa)
+        t['any_jstor'] = bool(any_jstor)
+        t['any_ia_sim'] = bool(any_ia_sim)
+    else:
+        t['in_doi'] = in_doi
+        t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
+        t['any_jstor'] = any_jstor
+        t['any_ia_sim'] = any_ia_sim
+    return t
+
+
+def changelog_to_elasticsearch(entity):
+
+    editgroup = entity.editgroup
+    t = dict(
+        index=entity.index,
+        editgroup_id=entity.editgroup_id,
+        timestamp=entity.timestamp,
+        editor_id=editgroup.editor_id,
+    )
+
+    extra = editgroup.extra or dict()
+    if extra.get('agent'):
+        t['agent'] = extra['agent']
+
+    t['containers'] = len(editgroup.edits.containers)
+    t['creators'] = len(editgroup.edits.creators)
+    t['files'] = len(editgroup.edits.files)
+    t['filesets'] = len(editgroup.edits.filesets)
+    t['webcaptures'] = len(editgroup.edits.webcaptures)
+    t['releases'] = len(editgroup.edits.releases)
+    t['works'] = len(editgroup.edits.works)
+
+    # TODO: parse and pull out counts
+    #created = 0
+    #updated = 0
+    #deleted = 0
+    #t['created'] = created
+    #t['updated'] = updated
+    #t['deleted'] = deleted
+    #t['total'] = created + updated + deleted
+    return t
diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py
new file mode 100644
index 00000000..b67df12d
--- /dev/null
+++ b/python/fatcat_tools/transforms/entities.py
@@ -0,0 +1,31 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+def entity_to_dict(entity, api_client=None):
+    """
+    Hack to take advantage of the code-generated serialization code.
+
+    Initializing/destroying ApiClient objects is surprisingly expensive
+    (because it involves a threadpool), so we allow passing an existing
+    instance. If you already have a full-on API connection `api`, you can
+    access the ApiClient object as `api.api_client`. This is such a speed-up
+    that this argument may become mandatory.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    return api_client.sanitize_for_serialization(entity)
+
+def entity_from_json(json_str, entity_type, api_client=None):
+    """
+    Hack to take advantage of the code-generated deserialization code.
+
+    See note on `entity_to_dict()` about the api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    thing = collections.namedtuple('Thing', ['data'])
+    thing.data = json_str
+    return api_client.deserialize(thing, entity_type)
+
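For orientation (not part of the commit itself): the new `fatcat_tools.transforms` package re-exports the same six names the old top-level import listed, so downstream `from fatcat_tools import ...` code keeps working. Below is a minimal round-trip sketch of the entity helpers plus the release transform. It assumes the 2019-era `fatcat_client` codegen models (whose constructors accept field keyword arguments); the ident and DOI values are hypothetical placeholders.

    import json

    from fatcat_client import ApiClient, ReleaseEntity
    from fatcat_tools.transforms import (
        entity_to_dict, entity_from_json, release_to_elasticsearch,
    )

    # Construct one ApiClient and reuse it: instantiation spins up a
    # threadpool, which is the overhead the entities.py docstring warns about.
    api_client = ApiClient()

    release = ReleaseEntity(
        ident="hypothetical123",   # placeholder identifier
        state="active",
        title="Example Paper",
        doi="10.1234/example",     # placeholder DOI
        # release_to_elasticsearch() calls len() on these fields, so pass
        # empty lists rather than leaving them as None:
        abstracts=[], refs=[], contribs=[],
        files=[], filesets=[], webcaptures=[],
    )

    # entity -> plain (JSON-safe) dict, via the code-generated serializer
    release_dict = entity_to_dict(release, api_client=api_client)

    # JSON string -> entity, via the code-generated deserializer
    round_tripped = entity_from_json(
        json.dumps(release_dict), ReleaseEntity, api_client=api_client)
    assert round_tripped.title == "Example Paper"

    # entity -> elasticsearch-oriented dict
    es_doc = release_to_elasticsearch(release)
    assert es_doc['title'] == "Example Paper"

Passing the shared `api_client` into both helpers avoids re-paying ApiClient construction on every call, which is the design point the docstring makes.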