diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-11 16:38:51 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-03-11 16:38:51 -0700 | 
| commit | 655f7060eb5b5e711a8a892cb1085639c4aa8fd2 (patch) | |
| tree | ffa1139e0c56b6510ec71d1aa8cc426423449f11 | |
| parent | c937447f894cfde54628fecf3fa71127cb769f0c (diff) | |
| download | fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.tar.gz fatcat-655f7060eb5b5e711a8a892cb1085639c4aa8fd2.zip | |
refactor transforms into sub-dir
| -rwxr-xr-x | python/fatcat_export.py | 115 | ||||
| -rw-r--r-- | python/fatcat_tools/__init__.py | 4 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/__init__.py | 4 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/csl.py | 170 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py (renamed from python/fatcat_tools/transforms.py) | 190 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/entities.py | 31 | ||||
| -rwxr-xr-x | python/fatcat_transform.py | 149 | 
7 files changed, 356 insertions, 307 deletions
| diff --git a/python/fatcat_export.py b/python/fatcat_export.py index a9d46142..e3c141fd 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -11,18 +11,11 @@ import sys  import json  import argparse -from citeproc import CitationStylesStyle, CitationStylesBibliography -from citeproc import Citation, CitationItem -from citeproc import formatter -from citeproc.source.json import CiteProcJSON -from citeproc_styles import get_style_filepath -  import fatcat_client  from fatcat_client.rest import ApiException  from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry  from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ -    release_to_elasticsearch, container_to_elasticsearch, \ -    changelog_to_elasticsearch, public_api, release_to_csl +    public_api  def run_export_releases(args): @@ -32,70 +25,6 @@ def run_export_releases(args):          args.json_output.write(              json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n") -def run_transform_releases(args): -    for line in args.json_input: -        line = line.strip() -        if not line: -            continue -        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) -        args.json_output.write( -            json.dumps(release_to_elasticsearch(entity)) + '\n') - -def run_transform_containers(args): -    for line in args.json_input: -        line = line.strip() -        if not line: -            continue -        entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) -        args.json_output.write( -            json.dumps(container_to_elasticsearch(entity)) + '\n') - -def run_transform_changelogs(args): -    for line in args.json_input: -        line = line.strip() -        if not line: -            continue -        entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) -        args.json_output.write( -            json.dumps(changelog_to_elasticsearch(entity)) + '\n') - -def run_citeproc_releases(args): -    for line in args.json_input: -        line = line.strip() -        if not line: -            continue -        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) -        csl_json = release_to_csl(entity) -        # XXX: -        csl_json['id'] = "release:" + (entity.ident or "unknown") -        if args.style == "csl-json": -            args.json_output.write(json.dumps(csl_json) + "\n") -            continue -        bib_src = CiteProcJSON([csl_json]) -        form = formatter.plain -        if args.html: -            form = formatter.html -        style_path = get_style_filepath(args.style) -        bib_style = CitationStylesStyle(style_path, validate=False) -        bib = CitationStylesBibliography(bib_style, bib_src, form) -        bib.register(Citation([CitationItem(csl_json['id'])])) -        # XXX: -        #args.json_output.write( -        #    json.dumps(release_to_csl(entity)) + '\n') -        lines = bib.bibliography()[0] -        if args.style == "bibtex": -            for l in lines: -                if l.startswith(" @"): -                    args.json_output.write("\n@") -                elif l.startswith(" "): -                    #print("line: START|{}|END".format(l)) -                    args.json_output.write("\n  " + l) -                else: -                    args.json_output.write(l) -        else: -            args.json_output.write(''.join(lines) + "\n") -        print() -  def run_export_changelog(args):      end = args.end      if end is None: @@ -126,48 +55,6 @@ def main():          help="where to send output",          default=sys.stdout, type=argparse.FileType('w')) -    sub_transform_releases = subparsers.add_parser('transform-releases') -    sub_transform_releases.set_defaults(func=run_transform_releases) -    sub_transform_releases.add_argument('json_input', -        help="JSON-per-line of release entities", -        default=sys.stdin, type=argparse.FileType('r')) -    sub_transform_releases.add_argument('json_output', -        help="where to send output", -        default=sys.stdout, type=argparse.FileType('w')) - -    sub_transform_containers = subparsers.add_parser('transform-containers') -    sub_transform_containers.set_defaults(func=run_transform_containers) -    sub_transform_containers.add_argument('json_input', -        help="JSON-per-line of container entities", -        default=sys.stdin, type=argparse.FileType('r')) -    sub_transform_containers.add_argument('json_output', -        help="where to send output", -        default=sys.stdout, type=argparse.FileType('w')) - -    sub_transform_changelogs = subparsers.add_parser('transform-changelogs') -    sub_transform_changelogs.set_defaults(func=run_transform_changelogs) -    sub_transform_changelogs.add_argument('json_input', -        help="JSON-per-line of changelog entries", -        default=sys.stdin, type=argparse.FileType('r')) -    sub_transform_changelogs.add_argument('json_output', -        help="where to send output", -        default=sys.stdout, type=argparse.FileType('w')) - -    sub_citeproc_releases = subparsers.add_parser('citeproc-releases') -    sub_citeproc_releases.set_defaults(func=run_citeproc_releases) -    sub_citeproc_releases.add_argument('json_input', -        help="JSON-per-line of release entities", -        default=sys.stdin, type=argparse.FileType('r')) -    sub_citeproc_releases.add_argument('json_output', -        help="where to send output", -        default=sys.stdout, type=argparse.FileType('w')) -    sub_citeproc_releases.add_argument('--style', -        help="citation style to output", -        default='csl-json') -    sub_citeproc_releases.add_argument('--html', -        action='store_true', -        help="output HTML, not plain text") -      sub_changelog = subparsers.add_parser('changelog')      sub_changelog.set_defaults(func=run_export_changelog)      sub_changelog.add_argument('--start', diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index c72ccd47..f2798f0b 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -1,6 +1,4 @@  from .api_auth import authenticated_api, public_api  from .fcid import fcid2uuid, uuid2fcid -from .transforms import entity_to_dict, entity_from_json, \ -    release_to_elasticsearch, container_to_elasticsearch, \ -    changelog_to_elasticsearch, release_to_csl +from .transforms import * diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py new file mode 100644 index 00000000..4950433b --- /dev/null +++ b/python/fatcat_tools/transforms/__init__.py @@ -0,0 +1,4 @@ + +from .entities import entity_to_dict, entity_from_json +from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch +from .csl import release_to_csl diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py new file mode 100644 index 00000000..f9615b26 --- /dev/null +++ b/python/fatcat_tools/transforms/csl.py @@ -0,0 +1,170 @@ + + +import collections +from fatcat_client import ApiClient + + +def contribs_by_role(contribs, role): +    ret = [c.copy() for c in contribs if c['role'] == role] +    [c.pop('role') for c in ret] +    # XXX: +    [c.pop('literal') for c in ret] +    if not ret: +        return None +    else: +        return ret + + +def release_to_csl(entity): +    """ +    Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, +    citeproc-JSON, aka Citation Style Language JSON) + +    This function will likely become an API method/endpoint + +    Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json +    """ +    contribs = [] +    for contrib in (entity.contribs or []): +        if contrib.creator: +            # TODO: should we actually be pulling creator metadata? or just +            # using release-local raw metadata? +            c = dict( +                family=contrib.creator.surname, +                given=contrib.creator.given_name, +                #dropping-particle +                #non-dropping-particle +                #suffix +                #comma-suffix +                #static-ordering +                literal=contrib.raw_name, # or display_name? +                #parse-names, +                role=contrib.role, +            ) +        else: +            c = dict( +                # XXX: possible inclusion of full name metadata in release_contrib +                family=contrib.raw_name.split()[-1], +                literal=contrib.raw_name, +                role=contrib.role, +            ) +        for k in list(c.keys()): +            if not c[k]: +                c.pop(k) +        contribs.append(c) +    abstract = None +    if entity.abstracts: +        abstract = entity.abstracts[0].content + +    issued_date = None +    if entity.release_date: +        issued_date = {"date-parts": [[ +            entity.release_date.year, +            entity.release_date.month, +            entity.release_date.day, +        ]]} +    elif entity.release_year: +        issued_date = {"date-parts": [[entity.release_year]]} + +    csl = dict( +        #id, +        #categories +        type=entity.release_type or "article", # XXX: can't be blank +        language=entity.language, +        #journalAbbreviation +        #shortTitle +        ## see below for all contrib roles +        #accessed +        #container +        #event-date +        issued=issued_date, +        #original-date +        #submitted +        abstract=abstract, +        #annote +        #archive +        #archive_location +        #archive-place +        #authority +        #call-number +        #chapter-number +        #citation-number +        #citation-label +        #collection-number +        #collection-title +        container_title=entity.container and entity.container.name, +        #container-title-short +        #dimensions +        DOI=entity.doi, +        #edition +        #event +        #event-place +        #first-reference-note-number +        #genre +        ISBN=entity.isbn13, +        ISSN=entity.container and entity.container.issnl, +        issue=entity.issue, +        #jurisdiction +        #keyword +        #locator +        #medium +        #note +        #number +        #number-of-pages +        #number-of-volumes +        #original-publisher +        #original-publisher-place +        #original-title +        # XXX: page=entity.pages, +        page_first=entity.pages.split('-')[0], +        PMCID=entity.pmcid, +        PMID=entity.pmid, +        publisher=(entity.container and entity.container.publisher) or entity.publisher, +        #publisher-place +        #references +        #reviewed-title +        #scale +        #section +        #source +        #status +        title=entity.title, +        #title-short +        #URL +        #version +        volume=entity.volume, +        #year-suffix +    ) +    for role in ['author', 'collection-editor', 'composer', 'container-author', +            'director', 'editor', 'editorial-director', 'interviewer', +            'illustrator', 'original-author', 'recipient', 'reviewed-author', +            'translator']: +        cbr = contribs_by_role(contribs, role) +        if cbr: +            csl[role] = cbr +    # underline-to-dash +    csl['container-title'] = csl.pop('container_title') +    csl['page-first'] = csl.pop('page_first') +    empty_keys = [k for k,v in csl.items() if not v] +    for k in empty_keys: +        csl.pop(k) +    return csl + + +def refs_to_csl(entity): +    ret = [] +    for ref in entity.refs: +        if ref.release_id and False: +            # TODO: fetch full entity from API and convert with release_to_csl +            raise NotImplementedError +        else: +            issued_date = None +            if ref.year: +                issued_date = [[ref.year]] +            csl = dict( +                title=ref.title, +                issued=issued_date, +            ) +        csl['id'] = ref.key or ref.index, # zero- or one-indexed? +        ret.append(csl) +    return ret + diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms/elasticsearch.py index f49b5ac9..0c2c5e46 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -3,31 +3,6 @@  import collections  from fatcat_client import ApiClient -def entity_to_dict(entity, api_client=None): -    """ -    Hack to take advantage of the code-generated serialization code. - -    Initializing/destroying ApiClient objects is surprisingly expensive -    (because it involves a threadpool), so we allow passing an existing -    instance. If you already have a full-on API connection `api`, you can -    access the ApiClient object as `api.api_client`. This is such a speed-up -    that this argument may become mandatory. -    """ -    if not api_client: -        api_client = ApiClient() -    return api_client.sanitize_for_serialization(entity) - -def entity_from_json(json_str, entity_type, api_client=None): -    """ -    Hack to take advantage of the code-generated deserialization code - -    See not on `entity_to_dict()` about api_client argument. -    """ -    if not api_client: -        api_client = ApiClient() -    thing = collections.namedtuple('Thing', ['data']) -    thing.data = json_str -    return api_client.deserialize(thing, entity_type)  def check_kbart(year, archive):      if not archive or not archive.get('year_spans'): @@ -319,171 +294,6 @@ def container_to_elasticsearch(entity, force_bool=True):      return t -def contribs_by_role(contribs, role): -    ret = [c.copy() for c in contribs if c['role'] == role] -    [c.pop('role') for c in ret] -    # XXX: -    [c.pop('literal') for c in ret] -    if not ret: -        return None -    else: -        return ret - - -def release_to_csl(entity): -    """ -    Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, -    citeproc-JSON, aka Citation Style Language JSON) - -    This function will likely become an API method/endpoint - -    Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json -    """ -    contribs = [] -    for contrib in (entity.contribs or []): -        if contrib.creator: -            # TODO: should we actually be pulling creator metadata? or just -            # using release-local raw metadata? -            c = dict( -                family=contrib.creator.surname, -                given=contrib.creator.given_name, -                #dropping-particle -                #non-dropping-particle -                #suffix -                #comma-suffix -                #static-ordering -                literal=contrib.raw_name, # or display_name? -                #parse-names, -                role=contrib.role, -            ) -        else: -            c = dict( -                # XXX: possible inclusion of full name metadata in release_contrib -                family=contrib.raw_name.split()[-1], -                literal=contrib.raw_name, -                role=contrib.role, -            ) -        for k in list(c.keys()): -            if not c[k]: -                c.pop(k) -        contribs.append(c) -    abstract = None -    if entity.abstracts: -        abstract = entity.abstracts[0].content - -    issued_date = None -    if entity.release_date: -        issued_date = {"date-parts": [[ -            entity.release_date.year, -            entity.release_date.month, -            entity.release_date.day, -        ]]} -    elif entity.release_year: -        issued_date = {"date-parts": [[entity.release_year]]} - -    csl = dict( -        #id, -        #categories -        type=entity.release_type or "article", # XXX: can't be blank -        language=entity.language, -        #journalAbbreviation -        #shortTitle -        ## see below for all contrib roles -        #accessed -        #container -        #event-date -        issued=issued_date, -        #original-date -        #submitted -        abstract=abstract, -        #annote -        #archive -        #archive_location -        #archive-place -        #authority -        #call-number -        #chapter-number -        #citation-number -        #citation-label -        #collection-number -        #collection-title -        container_title=entity.container and entity.container.name, -        #container-title-short -        #dimensions -        DOI=entity.doi, -        #edition -        #event -        #event-place -        #first-reference-note-number -        #genre -        ISBN=entity.isbn13, -        ISSN=entity.container and entity.container.issnl, -        issue=entity.issue, -        #jurisdiction -        #keyword -        #locator -        #medium -        #note -        #number -        #number-of-pages -        #number-of-volumes -        #original-publisher -        #original-publisher-place -        #original-title -        # XXX: page=entity.pages, -        page_first=entity.pages.split('-')[0], -        PMCID=entity.pmcid, -        PMID=entity.pmid, -        publisher=(entity.container and entity.container.publisher) or entity.publisher, -        #publisher-place -        #references -        #reviewed-title -        #scale -        #section -        #source -        #status -        title=entity.title, -        #title-short -        #URL -        #version -        volume=entity.volume, -        #year-suffix -    ) -    for role in ['author', 'collection-editor', 'composer', 'container-author', -            'director', 'editor', 'editorial-director', 'interviewer', -            'illustrator', 'original-author', 'recipient', 'reviewed-author', -            'translator']: -        cbr = contribs_by_role(contribs, role) -        if cbr: -            csl[role] = cbr -    # underline-to-dash -    csl['container-title'] = csl.pop('container_title') -    csl['page-first'] = csl.pop('page_first') -    empty_keys = [k for k,v in csl.items() if not v] -    for k in empty_keys: -        csl.pop(k) -    return csl - - -def refs_to_csl(entity): -    ret = [] -    for ref in entity.refs: -        if ref.release_id and False: -            # TODO: fetch full entity from API and convert with release_to_csl -            raise NotImplementedError -        else: -            issued_date = None -            if ref.year: -                issued_date = [[ref.year]] -            csl = dict( -                title=ref.title, -                issued=issued_date, -            ) -        csl['id'] = ref.key or ref.index, # zero- or one-indexed? -        ret.append(csl) -    return ret - -  def changelog_to_elasticsearch(entity):      editgroup = entity.editgroup diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py new file mode 100644 index 00000000..b67df12d --- /dev/null +++ b/python/fatcat_tools/transforms/entities.py @@ -0,0 +1,31 @@ + + +import collections +from fatcat_client import ApiClient + +def entity_to_dict(entity, api_client=None): +    """ +    Hack to take advantage of the code-generated serialization code. + +    Initializing/destroying ApiClient objects is surprisingly expensive +    (because it involves a threadpool), so we allow passing an existing +    instance. If you already have a full-on API connection `api`, you can +    access the ApiClient object as `api.api_client`. This is such a speed-up +    that this argument may become mandatory. +    """ +    if not api_client: +        api_client = ApiClient() +    return api_client.sanitize_for_serialization(entity) + +def entity_from_json(json_str, entity_type, api_client=None): +    """ +    Hack to take advantage of the code-generated deserialization code + +    See not on `entity_to_dict()` about api_client argument. +    """ +    if not api_client: +        api_client = ApiClient() +    thing = collections.namedtuple('Thing', ['data']) +    thing.data = json_str +    return api_client.deserialize(thing, entity_type) + diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py new file mode 100755 index 00000000..8d5c34c5 --- /dev/null +++ b/python/fatcat_transform.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +""" +""" + +import sys +import json +import argparse + +from citeproc import CitationStylesStyle, CitationStylesBibliography +from citeproc import Citation, CitationItem +from citeproc import formatter +from citeproc.source.json import CiteProcJSON +from citeproc_styles import get_style_filepath + +import fatcat_client +from fatcat_client.rest import ApiException +from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry +from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ +    release_to_elasticsearch, container_to_elasticsearch, \ +    changelog_to_elasticsearch, public_api, release_to_csl + + +def run_transform_releases(args): +    for line in args.json_input: +        line = line.strip() +        if not line: +            continue +        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) +        args.json_output.write( +            json.dumps(release_to_elasticsearch(entity)) + '\n') + +def run_transform_containers(args): +    for line in args.json_input: +        line = line.strip() +        if not line: +            continue +        entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) +        args.json_output.write( +            json.dumps(container_to_elasticsearch(entity)) + '\n') + +def run_transform_changelogs(args): +    for line in args.json_input: +        line = line.strip() +        if not line: +            continue +        entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) +        args.json_output.write( +            json.dumps(changelog_to_elasticsearch(entity)) + '\n') + +def run_citeproc_releases(args): +    for line in args.json_input: +        line = line.strip() +        if not line: +            continue +        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) +        csl_json = release_to_csl(entity) +        # XXX: +        csl_json['id'] = "release:" + (entity.ident or "unknown") +        if args.style == "csl-json": +            args.json_output.write(json.dumps(csl_json) + "\n") +            continue +        bib_src = CiteProcJSON([csl_json]) +        form = formatter.plain +        if args.html: +            form = formatter.html +        style_path = get_style_filepath(args.style) +        bib_style = CitationStylesStyle(style_path, validate=False) +        bib = CitationStylesBibliography(bib_style, bib_src, form) +        bib.register(Citation([CitationItem(csl_json['id'])])) +        # XXX: +        #args.json_output.write( +        #    json.dumps(release_to_csl(entity)) + '\n') +        lines = bib.bibliography()[0] +        if args.style == "bibtex": +            for l in lines: +                if l.startswith(" @"): +                    args.json_output.write("\n@") +                elif l.startswith(" "): +                    #print("line: START|{}|END".format(l)) +                    args.json_output.write("\n  " + l) +                else: +                    args.json_output.write(l) +        else: +            args.json_output.write(''.join(lines) + "\n") +        print() + +def main(): +    parser = argparse.ArgumentParser() +    parser.add_argument('--debug', +        action='store_true', +        help="enable debugging interface") +    parser.add_argument('--host-url', +        default="http://localhost:9411/v0", +        help="connect to this host/port") +    subparsers = parser.add_subparsers() + +    sub_transform_releases = subparsers.add_parser('transform-releases') +    sub_transform_releases.set_defaults(func=run_transform_releases) +    sub_transform_releases.add_argument('json_input', +        help="JSON-per-line of release entities", +        default=sys.stdin, type=argparse.FileType('r')) +    sub_transform_releases.add_argument('json_output', +        help="where to send output", +        default=sys.stdout, type=argparse.FileType('w')) + +    sub_transform_containers = subparsers.add_parser('transform-containers') +    sub_transform_containers.set_defaults(func=run_transform_containers) +    sub_transform_containers.add_argument('json_input', +        help="JSON-per-line of container entities", +        default=sys.stdin, type=argparse.FileType('r')) +    sub_transform_containers.add_argument('json_output', +        help="where to send output", +        default=sys.stdout, type=argparse.FileType('w')) + +    sub_transform_changelogs = subparsers.add_parser('transform-changelogs') +    sub_transform_changelogs.set_defaults(func=run_transform_changelogs) +    sub_transform_changelogs.add_argument('json_input', +        help="JSON-per-line of changelog entries", +        default=sys.stdin, type=argparse.FileType('r')) +    sub_transform_changelogs.add_argument('json_output', +        help="where to send output", +        default=sys.stdout, type=argparse.FileType('w')) + +    sub_citeproc_releases = subparsers.add_parser('citeproc-releases') +    sub_citeproc_releases.set_defaults(func=run_citeproc_releases) +    sub_citeproc_releases.add_argument('json_input', +        help="JSON-per-line of release entities", +        default=sys.stdin, type=argparse.FileType('r')) +    sub_citeproc_releases.add_argument('json_output', +        help="where to send output", +        default=sys.stdout, type=argparse.FileType('w')) +    sub_citeproc_releases.add_argument('--style', +        help="citation style to output", +        default='csl-json') +    sub_citeproc_releases.add_argument('--html', +        action='store_true', +        help="output HTML, not plain text") + +    args = parser.parse_args() +    if not args.__dict__.get("func"): +        print("tell me what to do!") +        sys.exit(-1) + +    args.api = public_api(args.host_url) +    args.func(args) + +if __name__ == '__main__': +    main() | 
