diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-11 16:38:51 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-11 16:38:51 -0700 | 
| commit | 655f7060eb5b5e711a8a892cb1085639c4aa8fd2 (patch) | |
| tree | ffa1139e0c56b6510ec71d1aa8cc426423449f11 /python/fatcat_tools | |
| parent | c937447f894cfde54628fecf3fa71127cb769f0c (diff) | |
| download | fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.tar.gz fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.zip | |
refactor transforms into sub-dir
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/__init__.py | 4 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/__init__.py | 4 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/csl.py | 170 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py (renamed from python/fatcat_tools/transforms.py) | 190 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/entities.py | 31 | 
5 files changed, 206 insertions, 193 deletions
| diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index c72ccd47..f2798f0b 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -1,6 +1,4 @@  from .api_auth import authenticated_api, public_api  from .fcid import fcid2uuid, uuid2fcid -from .transforms import entity_to_dict, entity_from_json, \ -    release_to_elasticsearch, container_to_elasticsearch, \ -    changelog_to_elasticsearch, release_to_csl +from .transforms import * diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py new file mode 100644 index 00000000..4950433b --- /dev/null +++ b/python/fatcat_tools/transforms/__init__.py @@ -0,0 +1,4 @@ + +from .entities import entity_to_dict, entity_from_json +from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch +from .csl import release_to_csl diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py new file mode 100644 index 00000000..f9615b26 --- /dev/null +++ b/python/fatcat_tools/transforms/csl.py @@ -0,0 +1,170 @@ + + +import collections +from fatcat_client import ApiClient + + +def contribs_by_role(contribs, role): +    ret = [c.copy() for c in contribs if c['role'] == role] +    [c.pop('role') for c in ret] +    # XXX: +    [c.pop('literal') for c in ret] +    if not ret: +        return None +    else: +        return ret + + +def release_to_csl(entity): +    """ +    Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, +    citeproc-JSON, aka Citation Style Language JSON) + +    This function will likely become an API method/endpoint + +    Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json +    """ +    contribs = [] +    for contrib in (entity.contribs or []): +        if contrib.creator: +            # TODO: should we actually be pulling creator metadata? or just +            # using release-local raw metadata? +            c = dict( +                family=contrib.creator.surname, +                given=contrib.creator.given_name, +                #dropping-particle +                #non-dropping-particle +                #suffix +                #comma-suffix +                #static-ordering +                literal=contrib.raw_name, # or display_name? +                #parse-names, +                role=contrib.role, +            ) +        else: +            c = dict( +                # XXX: possible inclusion of full name metadata in release_contrib +                family=contrib.raw_name.split()[-1], +                literal=contrib.raw_name, +                role=contrib.role, +            ) +        for k in list(c.keys()): +            if not c[k]: +                c.pop(k) +        contribs.append(c) +    abstract = None +    if entity.abstracts: +        abstract = entity.abstracts[0].content + +    issued_date = None +    if entity.release_date: +        issued_date = {"date-parts": [[ +            entity.release_date.year, +            entity.release_date.month, +            entity.release_date.day, +        ]]} +    elif entity.release_year: +        issued_date = {"date-parts": [[entity.release_year]]} + +    csl = dict( +        #id, +        #categories +        type=entity.release_type or "article", # XXX: can't be blank +        language=entity.language, +        #journalAbbreviation +        #shortTitle +        ## see below for all contrib roles +        #accessed +        #container +        #event-date +        issued=issued_date, +        #original-date +        #submitted +        abstract=abstract, +        #annote +        #archive +        #archive_location +        #archive-place +        #authority +        #call-number +        #chapter-number +        #citation-number +        #citation-label +        #collection-number +        #collection-title +        container_title=entity.container and entity.container.name, +        #container-title-short +        #dimensions +        DOI=entity.doi, +        #edition +        #event +        #event-place +        #first-reference-note-number +        #genre +        ISBN=entity.isbn13, +        ISSN=entity.container and entity.container.issnl, +        issue=entity.issue, +        #jurisdiction +        #keyword +        #locator +        #medium +        #note +        #number +        #number-of-pages +        #number-of-volumes +        #original-publisher +        #original-publisher-place +        #original-title +        # XXX: page=entity.pages, +        page_first=entity.pages.split('-')[0], +        PMCID=entity.pmcid, +        PMID=entity.pmid, +        publisher=(entity.container and entity.container.publisher) or entity.publisher, +        #publisher-place +        #references +        #reviewed-title +        #scale +        #section +        #source +        #status +        title=entity.title, +        #title-short +        #URL +        #version +        volume=entity.volume, +        #year-suffix +    ) +    for role in ['author', 'collection-editor', 'composer', 'container-author', +            'director', 'editor', 'editorial-director', 'interviewer', +            'illustrator', 'original-author', 'recipient', 'reviewed-author', +            'translator']: +        cbr = contribs_by_role(contribs, role) +        if cbr: +            csl[role] = cbr +    # underline-to-dash +    csl['container-title'] = csl.pop('container_title') +    csl['page-first'] = csl.pop('page_first') +    empty_keys = [k for k,v in csl.items() if not v] +    for k in empty_keys: +        csl.pop(k) +    return csl + + +def refs_to_csl(entity): +    ret = [] +    for ref in entity.refs: +        if ref.release_id and False: +            # TODO: fetch full entity from API and convert with release_to_csl +            raise NotImplementedError +        else: +            issued_date = None +            if ref.year: +                issued_date = [[ref.year]] +            csl = dict( +                title=ref.title, +                issued=issued_date, +            ) +        csl['id'] = ref.key or ref.index, # zero- or one-indexed? +        ret.append(csl) +    return ret + diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms/elasticsearch.py index f49b5ac9..0c2c5e46 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -3,31 +3,6 @@  import collections  from fatcat_client import ApiClient -def entity_to_dict(entity, api_client=None): -    """ -    Hack to take advantage of the code-generated serialization code. - -    Initializing/destroying ApiClient objects is surprisingly expensive -    (because it involves a threadpool), so we allow passing an existing -    instance. If you already have a full-on API connection `api`, you can -    access the ApiClient object as `api.api_client`. This is such a speed-up -    that this argument may become mandatory. -    """ -    if not api_client: -        api_client = ApiClient() -    return api_client.sanitize_for_serialization(entity) - -def entity_from_json(json_str, entity_type, api_client=None): -    """ -    Hack to take advantage of the code-generated deserialization code - -    See not on `entity_to_dict()` about api_client argument. -    """ -    if not api_client: -        api_client = ApiClient() -    thing = collections.namedtuple('Thing', ['data']) -    thing.data = json_str -    return api_client.deserialize(thing, entity_type)  def check_kbart(year, archive):      if not archive or not archive.get('year_spans'): @@ -319,171 +294,6 @@ def container_to_elasticsearch(entity, force_bool=True):      return t -def contribs_by_role(contribs, role): -    ret = [c.copy() for c in contribs if c['role'] == role] -    [c.pop('role') for c in ret] -    # XXX: -    [c.pop('literal') for c in ret] -    if not ret: -        return None -    else: -        return ret - - -def release_to_csl(entity): -    """ -    Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, -    citeproc-JSON, aka Citation Style Language JSON) - -    This function will likely become an API method/endpoint - -    Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json -    """ -    contribs = [] -    for contrib in (entity.contribs or []): -        if contrib.creator: -            # TODO: should we actually be pulling creator metadata? or just -            # using release-local raw metadata? -            c = dict( -                family=contrib.creator.surname, -                given=contrib.creator.given_name, -                #dropping-particle -                #non-dropping-particle -                #suffix -                #comma-suffix -                #static-ordering -                literal=contrib.raw_name, # or display_name? -                #parse-names, -                role=contrib.role, -            ) -        else: -            c = dict( -                # XXX: possible inclusion of full name metadata in release_contrib -                family=contrib.raw_name.split()[-1], -                literal=contrib.raw_name, -                role=contrib.role, -            ) -        for k in list(c.keys()): -            if not c[k]: -                c.pop(k) -        contribs.append(c) -    abstract = None -    if entity.abstracts: -        abstract = entity.abstracts[0].content - -    issued_date = None -    if entity.release_date: -        issued_date = {"date-parts": [[ -            entity.release_date.year, -            entity.release_date.month, -            entity.release_date.day, -        ]]} -    elif entity.release_year: -        issued_date = {"date-parts": [[entity.release_year]]} - -    csl = dict( -        #id, -        #categories -        type=entity.release_type or "article", # XXX: can't be blank -        language=entity.language, -        #journalAbbreviation -        #shortTitle -        ## see below for all contrib roles -        #accessed -        #container -        #event-date -        issued=issued_date, -        #original-date -        #submitted -        abstract=abstract, -        #annote -        #archive -        #archive_location -        #archive-place -        #authority -        #call-number -        #chapter-number -        #citation-number -        #citation-label -        #collection-number -        #collection-title -        container_title=entity.container and entity.container.name, -        #container-title-short -        #dimensions -        DOI=entity.doi, -        #edition -        #event -        #event-place -        #first-reference-note-number -        #genre -        ISBN=entity.isbn13, -        ISSN=entity.container and entity.container.issnl, -        issue=entity.issue, -        #jurisdiction -        #keyword -        #locator -        #medium -        #note -        #number -        #number-of-pages -        #number-of-volumes -        #original-publisher -        #original-publisher-place -        #original-title -        # XXX: page=entity.pages, -        page_first=entity.pages.split('-')[0], -        PMCID=entity.pmcid, -        PMID=entity.pmid, -        publisher=(entity.container and entity.container.publisher) or entity.publisher, -        #publisher-place -        #references -        #reviewed-title -        #scale -        #section -        #source -        #status -        title=entity.title, -        #title-short -        #URL -        #version -        volume=entity.volume, -        #year-suffix -    ) -    for role in ['author', 'collection-editor', 'composer', 'container-author', -            'director', 'editor', 'editorial-director', 'interviewer', -            'illustrator', 'original-author', 'recipient', 'reviewed-author', -            'translator']: -        cbr = contribs_by_role(contribs, role) -        if cbr: -            csl[role] = cbr -    # underline-to-dash -    csl['container-title'] = csl.pop('container_title') -    csl['page-first'] = csl.pop('page_first') -    empty_keys = [k for k,v in csl.items() if not v] -    for k in empty_keys: -        csl.pop(k) -    return csl - - -def refs_to_csl(entity): -    ret = [] -    for ref in entity.refs: -        if ref.release_id and False: -            # TODO: fetch full entity from API and convert with release_to_csl -            raise NotImplementedError -        else: -            issued_date = None -            if ref.year: -                issued_date = [[ref.year]] -            csl = dict( -                title=ref.title, -                issued=issued_date, -            ) -        csl['id'] = ref.key or ref.index, # zero- or one-indexed? -        ret.append(csl) -    return ret - -  def changelog_to_elasticsearch(entity):      editgroup = entity.editgroup diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py new file mode 100644 index 00000000..b67df12d --- /dev/null +++ b/python/fatcat_tools/transforms/entities.py @@ -0,0 +1,31 @@ + + +import collections +from fatcat_client import ApiClient + +def entity_to_dict(entity, api_client=None): +    """ +    Hack to take advantage of the code-generated serialization code. + +    Initializing/destroying ApiClient objects is surprisingly expensive +    (because it involves a threadpool), so we allow passing an existing +    instance. If you already have a full-on API connection `api`, you can +    access the ApiClient object as `api.api_client`. This is such a speed-up +    that this argument may become mandatory. +    """ +    if not api_client: +        api_client = ApiClient() +    return api_client.sanitize_for_serialization(entity) + +def entity_from_json(json_str, entity_type, api_client=None): +    """ +    Hack to take advantage of the code-generated deserialization code + +    See not on `entity_to_dict()` about api_client argument. +    """ +    if not api_client: +        api_client = ApiClient() +    thing = collections.namedtuple('Thing', ['data']) +    thing.data = json_str +    return api_client.deserialize(thing, entity_type) + | 
