aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-03-11 16:38:51 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-03-11 16:38:51 -0700
commit 655f7060eb5b5e711a8a892cb1085639c4aa8fd2 (patch)
tree ffa1139e0c56b6510ec71d1aa8cc426423449f11 /python/fatcat_tools
parent c937447f894cfde54628fecf3fa71127cb769f0c (diff)
download fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.tar.gz
fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.zip
refactor transforms into sub-dir
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/__init__.py4
-rw-r--r--python/fatcat_tools/transforms/__init__.py4
-rw-r--r--python/fatcat_tools/transforms/csl.py170
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py (renamed from python/fatcat_tools/transforms.py)190
-rw-r--r--python/fatcat_tools/transforms/entities.py31
5 files changed, 206 insertions, 193 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index c72ccd47..f2798f0b 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,6 +1,4 @@
from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_dict, entity_from_json, \
- release_to_elasticsearch, container_to_elasticsearch, \
- changelog_to_elasticsearch, release_to_csl
+from .transforms import *
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
new file mode 100644
index 00000000..4950433b
--- /dev/null
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -0,0 +1,4 @@
+
+from .entities import entity_to_dict, entity_from_json
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .csl import release_to_csl
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
new file mode 100644
index 00000000..f9615b26
--- /dev/null
+++ b/python/fatcat_tools/transforms/csl.py
@@ -0,0 +1,170 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+
+def contribs_by_role(contribs, role):
+    """
+    Selects the contributors having the given role, as CSL name dicts.
+
+    `contribs` is an iterable of dicts. Entries whose 'role' value equals
+    `role` are shallow-copied and returned with the 'role' and 'literal' keys
+    removed; the input dicts are not modified.
+
+    Returns a list of dicts, or None if no contributor has that role.
+    """
+    ret = []
+    for contrib in contribs:
+        # contributor dicts may have had empty values pruned, so 'role' (and
+        # 'literal', below) are not guaranteed to be present
+        if contrib.get('role') != role:
+            continue
+        c = contrib.copy()
+        c.pop('role', None)
+        # XXX: 'literal' is dropped from the output for now
+        c.pop('literal', None)
+        ret.append(c)
+    return ret or None
+
+
+def release_to_csl(entity):
+    """
+    Returns a python dict which can be json.dumps() to get a CSL-JSON (aka,
+    citeproc-JSON, aka Citation Style Language JSON)
+
+    This function will likely become an API method/endpoint
+
+    Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
+
+    `entity` is a release entity; the attributes read here are contribs,
+    abstracts, release_date, release_year, release_type, language, container,
+    doi, isbn13, issue, pages, pmcid, pmid, publisher, title and volume.
+    Fields with empty/None values are left out of the returned dict.
+    """
+    contribs = []
+    for contrib in (entity.contribs or []):
+        if contrib.creator:
+            # TODO: should we actually be pulling creator metadata? or just
+            # using release-local raw metadata?
+            c = dict(
+                family=contrib.creator.surname,
+                given=contrib.creator.given_name,
+                #dropping-particle
+                #non-dropping-particle
+                #suffix
+                #comma-suffix
+                #static-ordering
+                literal=contrib.raw_name, # or display_name?
+                #parse-names,
+                role=contrib.role,
+            )
+        else:
+            # raw_name may be missing or blank; only guess a family name
+            # when there is at least one token to take it from
+            name_parts = contrib.raw_name.split() if contrib.raw_name else []
+            c = dict(
+                # XXX: possible inclusion of full name metadata in release_contrib
+                family=name_parts[-1] if name_parts else None,
+                literal=contrib.raw_name,
+                role=contrib.role,
+            )
+        # drop empty values
+        c = {k: v for k, v in c.items() if v}
+        # a contributor without a role can never be emitted under any of the
+        # CSL role keys below, so there is no point in keeping it around
+        if 'role' in c:
+            contribs.append(c)
+    abstract = None
+    if entity.abstracts:
+        abstract = entity.abstracts[0].content
+
+    issued_date = None
+    if entity.release_date:
+        issued_date = {"date-parts": [[
+            entity.release_date.year,
+            entity.release_date.month,
+            entity.release_date.day,
+        ]]}
+    elif entity.release_year:
+        issued_date = {"date-parts": [[entity.release_year]]}
+
+    # pages is optional; only the part before the first '-' is used
+    page_first = None
+    if entity.pages:
+        page_first = entity.pages.split('-')[0]
+
+    csl = dict(
+        #id,
+        #categories
+        type=entity.release_type or "article", # XXX: can't be blank
+        language=entity.language,
+        #journalAbbreviation
+        #shortTitle
+        ## see below for all contrib roles
+        #accessed
+        #container
+        #event-date
+        issued=issued_date,
+        #original-date
+        #submitted
+        abstract=abstract,
+        #annote
+        #archive
+        #archive_location
+        #archive-place
+        #authority
+        #call-number
+        #chapter-number
+        #citation-number
+        #citation-label
+        #collection-number
+        #collection-title
+        container_title=entity.container and entity.container.name,
+        #container-title-short
+        #dimensions
+        DOI=entity.doi,
+        #edition
+        #event
+        #event-place
+        #first-reference-note-number
+        #genre
+        ISBN=entity.isbn13,
+        ISSN=entity.container and entity.container.issnl,
+        issue=entity.issue,
+        #jurisdiction
+        #keyword
+        #locator
+        #medium
+        #note
+        #number
+        #number-of-pages
+        #number-of-volumes
+        #original-publisher
+        #original-publisher-place
+        #original-title
+        # XXX: page=entity.pages,
+        page_first=page_first,
+        PMCID=entity.pmcid,
+        PMID=entity.pmid,
+        publisher=(entity.container and entity.container.publisher) or entity.publisher,
+        #publisher-place
+        #references
+        #reviewed-title
+        #scale
+        #section
+        #source
+        #status
+        title=entity.title,
+        #title-short
+        #URL
+        #version
+        volume=entity.volume,
+        #year-suffix
+    )
+    for role in ['author', 'collection-editor', 'composer', 'container-author',
+                 'director', 'editor', 'editorial-director', 'interviewer',
+                 'illustrator', 'original-author', 'recipient', 'reviewed-author',
+                 'translator']:
+        cbr = contribs_by_role(contribs, role)
+        if cbr:
+            csl[role] = cbr
+    # underline-to-dash: dashed names can't be passed as dict() keyword args
+    csl['container-title'] = csl.pop('container_title')
+    csl['page-first'] = csl.pop('page_first')
+    # drop every field that ended up empty
+    empty_keys = [k for k, v in csl.items() if not v]
+    for k in empty_keys:
+        csl.pop(k)
+    return csl
+
+
+def refs_to_csl(entity):
+    """
+    Converts the references of a release entity into a list of minimal
+    CSL-JSON style dicts, each with 'title', 'issued' and 'id' keys.
+
+    'issued' is a CSL date object ({"date-parts": [[year]]}) when the ref has
+    a year, else None. 'id' is the ref key, falling back to the ref index.
+
+    Returns an empty list if the entity has no refs.
+    """
+    ret = []
+    for ref in (entity.refs or []):
+        # TODO: when ref.release_id is set, fetch the full entity from the API
+        # and convert it with release_to_csl() instead of this minimal form
+        issued_date = None
+        if ref.year:
+            issued_date = {"date-parts": [[ref.year]]}
+        csl = dict(
+            title=ref.title,
+            issued=issued_date,
+        )
+        # NOTE: no trailing comma here; the id must be a scalar, not a tuple
+        csl['id'] = ref.key or ref.index # zero- or one-indexed?
+        ret.append(csl)
+    return ret
+
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms/elasticsearch.py
index f49b5ac9..0c2c5e46 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -3,31 +3,6 @@
import collections
from fatcat_client import ApiClient
-def entity_to_dict(entity, api_client=None):
- """
- Hack to take advantage of the code-generated serialization code.
-
- Initializing/destroying ApiClient objects is surprisingly expensive
- (because it involves a threadpool), so we allow passing an existing
- instance. If you already have a full-on API connection `api`, you can
- access the ApiClient object as `api.api_client`. This is such a speed-up
- that this argument may become mandatory.
- """
- if not api_client:
- api_client = ApiClient()
- return api_client.sanitize_for_serialization(entity)
-
-def entity_from_json(json_str, entity_type, api_client=None):
- """
- Hack to take advantage of the code-generated deserialization code
-
- See not on `entity_to_dict()` about api_client argument.
- """
- if not api_client:
- api_client = ApiClient()
- thing = collections.namedtuple('Thing', ['data'])
- thing.data = json_str
- return api_client.deserialize(thing, entity_type)
def check_kbart(year, archive):
if not archive or not archive.get('year_spans'):
@@ -319,171 +294,6 @@ def container_to_elasticsearch(entity, force_bool=True):
return t
-def contribs_by_role(contribs, role):
- ret = [c.copy() for c in contribs if c['role'] == role]
- [c.pop('role') for c in ret]
- # XXX:
- [c.pop('literal') for c in ret]
- if not ret:
- return None
- else:
- return ret
-
-
-def release_to_csl(entity):
- """
- Returns a python dict which can be json.dumps() to get a CSL-JSON (aka,
- citeproc-JSON, aka Citation Style Language JSON)
-
- This function will likely become an API method/endpoint
-
- Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
- """
- contribs = []
- for contrib in (entity.contribs or []):
- if contrib.creator:
- # TODO: should we actually be pulling creator metadata? or just
- # using release-local raw metadata?
- c = dict(
- family=contrib.creator.surname,
- given=contrib.creator.given_name,
- #dropping-particle
- #non-dropping-particle
- #suffix
- #comma-suffix
- #static-ordering
- literal=contrib.raw_name, # or display_name?
- #parse-names,
- role=contrib.role,
- )
- else:
- c = dict(
- # XXX: possible inclusion of full name metadata in release_contrib
- family=contrib.raw_name.split()[-1],
- literal=contrib.raw_name,
- role=contrib.role,
- )
- for k in list(c.keys()):
- if not c[k]:
- c.pop(k)
- contribs.append(c)
- abstract = None
- if entity.abstracts:
- abstract = entity.abstracts[0].content
-
- issued_date = None
- if entity.release_date:
- issued_date = {"date-parts": [[
- entity.release_date.year,
- entity.release_date.month,
- entity.release_date.day,
- ]]}
- elif entity.release_year:
- issued_date = {"date-parts": [[entity.release_year]]}
-
- csl = dict(
- #id,
- #categories
- type=entity.release_type or "article", # XXX: can't be blank
- language=entity.language,
- #journalAbbreviation
- #shortTitle
- ## see below for all contrib roles
- #accessed
- #container
- #event-date
- issued=issued_date,
- #original-date
- #submitted
- abstract=abstract,
- #annote
- #archive
- #archive_location
- #archive-place
- #authority
- #call-number
- #chapter-number
- #citation-number
- #citation-label
- #collection-number
- #collection-title
- container_title=entity.container and entity.container.name,
- #container-title-short
- #dimensions
- DOI=entity.doi,
- #edition
- #event
- #event-place
- #first-reference-note-number
- #genre
- ISBN=entity.isbn13,
- ISSN=entity.container and entity.container.issnl,
- issue=entity.issue,
- #jurisdiction
- #keyword
- #locator
- #medium
- #note
- #number
- #number-of-pages
- #number-of-volumes
- #original-publisher
- #original-publisher-place
- #original-title
- # XXX: page=entity.pages,
- page_first=entity.pages.split('-')[0],
- PMCID=entity.pmcid,
- PMID=entity.pmid,
- publisher=(entity.container and entity.container.publisher) or entity.publisher,
- #publisher-place
- #references
- #reviewed-title
- #scale
- #section
- #source
- #status
- title=entity.title,
- #title-short
- #URL
- #version
- volume=entity.volume,
- #year-suffix
- )
- for role in ['author', 'collection-editor', 'composer', 'container-author',
- 'director', 'editor', 'editorial-director', 'interviewer',
- 'illustrator', 'original-author', 'recipient', 'reviewed-author',
- 'translator']:
- cbr = contribs_by_role(contribs, role)
- if cbr:
- csl[role] = cbr
- # underline-to-dash
- csl['container-title'] = csl.pop('container_title')
- csl['page-first'] = csl.pop('page_first')
- empty_keys = [k for k,v in csl.items() if not v]
- for k in empty_keys:
- csl.pop(k)
- return csl
-
-
-def refs_to_csl(entity):
- ret = []
- for ref in entity.refs:
- if ref.release_id and False:
- # TODO: fetch full entity from API and convert with release_to_csl
- raise NotImplementedError
- else:
- issued_date = None
- if ref.year:
- issued_date = [[ref.year]]
- csl = dict(
- title=ref.title,
- issued=issued_date,
- )
- csl['id'] = ref.key or ref.index, # zero- or one-indexed?
- ret.append(csl)
- return ret
-
-
def changelog_to_elasticsearch(entity):
editgroup = entity.editgroup
diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py
new file mode 100644
index 00000000..b67df12d
--- /dev/null
+++ b/python/fatcat_tools/transforms/entities.py
@@ -0,0 +1,31 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+def entity_to_dict(entity, api_client=None):
+    """
+    Serializes an entity via the code-generated client's
+    sanitize_for_serialization() (a hack to re-use that serialization code).
+
+    An existing ApiClient may be passed as `api_client`; when it is missing a
+    fresh one is created for this call. Creating/destroying ApiClient objects
+    is surprisingly expensive (a threadpool is involved), so re-using one is
+    a big speed-up, and this argument may become mandatory. If you already
+    have a full-on API connection `api`, its ApiClient is `api.api_client`.
+    """
+    client = api_client or ApiClient()
+    return client.sanitize_for_serialization(entity)
+
+def entity_from_json(json_str, entity_type, api_client=None):
+    """
+    Hack to take advantage of the code-generated deserialization code
+
+    `json_str` is handed to api_client.deserialize() wrapped in a minimal
+    response-like object exposing it as `.data`; `entity_type` is passed
+    through unchanged. Returns whatever deserialize() returns.
+
+    See note on `entity_to_dict()` about api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    # build an actual instance carrying the payload, instead of overwriting
+    # the `data` field descriptor on the namedtuple class itself
+    Thing = collections.namedtuple('Thing', ['data'])
+    return api_client.deserialize(Thing(data=json_str), entity_type)
+