diff options
-rwxr-xr-x | python/fatcat_export.py | 115 | ||||
-rw-r--r-- | python/fatcat_tools/__init__.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/__init__.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/csl.py | 170 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py (renamed from python/fatcat_tools/transforms.py) | 190 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/entities.py | 31 | ||||
-rwxr-xr-x | python/fatcat_transform.py | 149 |
7 files changed, 356 insertions, 307 deletions
diff --git a/python/fatcat_export.py b/python/fatcat_export.py index a9d46142..e3c141fd 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -11,18 +11,11 @@ import sys import json import argparse -from citeproc import CitationStylesStyle, CitationStylesBibliography -from citeproc import Citation, CitationItem -from citeproc import formatter -from citeproc.source.json import CiteProcJSON -from citeproc_styles import get_style_filepath - import fatcat_client from fatcat_client.rest import ApiException from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ - release_to_elasticsearch, container_to_elasticsearch, \ - changelog_to_elasticsearch, public_api, release_to_csl + public_api def run_export_releases(args): @@ -32,70 +25,6 @@ def run_export_releases(args): args.json_output.write( json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n") -def run_transform_releases(args): - for line in args.json_input: - line = line.strip() - if not line: - continue - entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) - args.json_output.write( - json.dumps(release_to_elasticsearch(entity)) + '\n') - -def run_transform_containers(args): - for line in args.json_input: - line = line.strip() - if not line: - continue - entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) - args.json_output.write( - json.dumps(container_to_elasticsearch(entity)) + '\n') - -def run_transform_changelogs(args): - for line in args.json_input: - line = line.strip() - if not line: - continue - entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) - args.json_output.write( - json.dumps(changelog_to_elasticsearch(entity)) + '\n') - -def run_citeproc_releases(args): - for line in args.json_input: - line = line.strip() - if not line: - continue - entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) - csl_json = release_to_csl(entity) - # XXX: - csl_json['id'] = "release:" + (entity.ident or "unknown") - if args.style == "csl-json": - args.json_output.write(json.dumps(csl_json) + "\n") - continue - bib_src = CiteProcJSON([csl_json]) - form = formatter.plain - if args.html: - form = formatter.html - style_path = get_style_filepath(args.style) - bib_style = CitationStylesStyle(style_path, validate=False) - bib = CitationStylesBibliography(bib_style, bib_src, form) - bib.register(Citation([CitationItem(csl_json['id'])])) - # XXX: - #args.json_output.write( - # json.dumps(release_to_csl(entity)) + '\n') - lines = bib.bibliography()[0] - if args.style == "bibtex": - for l in lines: - if l.startswith(" @"): - args.json_output.write("\n@") - elif l.startswith(" "): - #print("line: START|{}|END".format(l)) - args.json_output.write("\n " + l) - else: - args.json_output.write(l) - else: - args.json_output.write(''.join(lines) + "\n") - print() - def run_export_changelog(args): end = args.end if end is None: @@ -126,48 +55,6 @@ def main(): help="where to send output", default=sys.stdout, type=argparse.FileType('w')) - sub_transform_releases = subparsers.add_parser('transform-releases') - sub_transform_releases.set_defaults(func=run_transform_releases) - sub_transform_releases.add_argument('json_input', - help="JSON-per-line of release entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_transform_releases.add_argument('json_output', - help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - - sub_transform_containers = subparsers.add_parser('transform-containers') - sub_transform_containers.set_defaults(func=run_transform_containers) - sub_transform_containers.add_argument('json_input', - help="JSON-per-line of container entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_transform_containers.add_argument('json_output', - help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - - sub_transform_changelogs = subparsers.add_parser('transform-changelogs') - sub_transform_changelogs.set_defaults(func=run_transform_changelogs) - sub_transform_changelogs.add_argument('json_input', - help="JSON-per-line of changelog entries", - default=sys.stdin, type=argparse.FileType('r')) - sub_transform_changelogs.add_argument('json_output', - help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - - sub_citeproc_releases = subparsers.add_parser('citeproc-releases') - sub_citeproc_releases.set_defaults(func=run_citeproc_releases) - sub_citeproc_releases.add_argument('json_input', - help="JSON-per-line of release entities", - default=sys.stdin, type=argparse.FileType('r')) - sub_citeproc_releases.add_argument('json_output', - help="where to send output", - default=sys.stdout, type=argparse.FileType('w')) - sub_citeproc_releases.add_argument('--style', - help="citation style to output", - default='csl-json') - sub_citeproc_releases.add_argument('--html', - action='store_true', - help="output HTML, not plain text") - sub_changelog = subparsers.add_parser('changelog') sub_changelog.set_defaults(func=run_export_changelog) sub_changelog.add_argument('--start', diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index c72ccd47..f2798f0b 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -1,6 +1,4 @@ from .api_auth import authenticated_api, public_api from .fcid import fcid2uuid, uuid2fcid -from .transforms import entity_to_dict, entity_from_json, \ - release_to_elasticsearch, container_to_elasticsearch, \ - changelog_to_elasticsearch, release_to_csl +from .transforms import * diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py new file mode 100644 index 00000000..4950433b --- /dev/null +++ b/python/fatcat_tools/transforms/__init__.py @@ -0,0 +1,4 @@ + +from .entities import entity_to_dict, entity_from_json +from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch +from .csl import release_to_csl diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py new file mode 100644 index 00000000..f9615b26 --- /dev/null +++ b/python/fatcat_tools/transforms/csl.py @@ -0,0 +1,170 @@ + + +import collections +from fatcat_client import ApiClient + + +def contribs_by_role(contribs, role): + ret = [c.copy() for c in contribs if c['role'] == role] + [c.pop('role') for c in ret] + # XXX: + [c.pop('literal') for c in ret] + if not ret: + return None + else: + return ret + + +def release_to_csl(entity): + """ + Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, + citeproc-JSON, aka Citation Style Language JSON) + + This function will likely become an API method/endpoint + + Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json + """ + contribs = [] + for contrib in (entity.contribs or []): + if contrib.creator: + # TODO: should we actually be pulling creator metadata? or just + # using release-local raw metadata? + c = dict( + family=contrib.creator.surname, + given=contrib.creator.given_name, + #dropping-particle + #non-dropping-particle + #suffix + #comma-suffix + #static-ordering + literal=contrib.raw_name, # or display_name? + #parse-names, + role=contrib.role, + ) + else: + c = dict( + # XXX: possible inclusion of full name metadata in release_contrib + family=contrib.raw_name.split()[-1], + literal=contrib.raw_name, + role=contrib.role, + ) + for k in list(c.keys()): + if not c[k]: + c.pop(k) + contribs.append(c) + abstract = None + if entity.abstracts: + abstract = entity.abstracts[0].content + + issued_date = None + if entity.release_date: + issued_date = {"date-parts": [[ + entity.release_date.year, + entity.release_date.month, + entity.release_date.day, + ]]} + elif entity.release_year: + issued_date = {"date-parts": [[entity.release_year]]} + + csl = dict( + #id, + #categories + type=entity.release_type or "article", # XXX: can't be blank + language=entity.language, + #journalAbbreviation + #shortTitle + ## see below for all contrib roles + #accessed + #container + #event-date + issued=issued_date, + #original-date + #submitted + abstract=abstract, + #annote + #archive + #archive_location + #archive-place + #authority + #call-number + #chapter-number + #citation-number + #citation-label + #collection-number + #collection-title + container_title=entity.container and entity.container.name, + #container-title-short + #dimensions + DOI=entity.doi, + #edition + #event + #event-place + #first-reference-note-number + #genre + ISBN=entity.isbn13, + ISSN=entity.container and entity.container.issnl, + issue=entity.issue, + #jurisdiction + #keyword + #locator + #medium + #note + #number + #number-of-pages + #number-of-volumes + #original-publisher + #original-publisher-place + #original-title + # XXX: page=entity.pages, + page_first=entity.pages.split('-')[0], + PMCID=entity.pmcid, + PMID=entity.pmid, + publisher=(entity.container and entity.container.publisher) or entity.publisher, + #publisher-place + #references + #reviewed-title + #scale + #section + #source + #status + title=entity.title, + #title-short + #URL + #version + volume=entity.volume, + #year-suffix + ) + for role in ['author', 'collection-editor', 'composer', 'container-author', + 'director', 'editor', 'editorial-director', 'interviewer', + 'illustrator', 'original-author', 'recipient', 'reviewed-author', + 'translator']: + cbr = contribs_by_role(contribs, role) + if cbr: + csl[role] = cbr + # underline-to-dash + csl['container-title'] = csl.pop('container_title') + csl['page-first'] = csl.pop('page_first') + empty_keys = [k for k,v in csl.items() if not v] + for k in empty_keys: + csl.pop(k) + return csl + + +def refs_to_csl(entity): + ret = [] + for ref in entity.refs: + if ref.release_id and False: + # TODO: fetch full entity from API and convert with release_to_csl + raise NotImplementedError + else: + issued_date = None + if ref.year: + issued_date = [[ref.year]] + csl = dict( + title=ref.title, + issued=issued_date, + ) + csl['id'] = ref.key or ref.index, # zero- or one-indexed? + ret.append(csl) + return ret + diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms/elasticsearch.py index f49b5ac9..0c2c5e46 100644 --- a/python/fatcat_tools/transforms.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -3,31 +3,6 @@ import collections from fatcat_client import ApiClient -def entity_to_dict(entity, api_client=None): - """ - Hack to take advantage of the code-generated serialization code. - - Initializing/destroying ApiClient objects is surprisingly expensive - (because it involves a threadpool), so we allow passing an existing - instance. If you already have a full-on API connection `api`, you can - access the ApiClient object as `api.api_client`. This is such a speed-up - that this argument may become mandatory. - """ - if not api_client: - api_client = ApiClient() - return api_client.sanitize_for_serialization(entity) - -def entity_from_json(json_str, entity_type, api_client=None): - """ - Hack to take advantage of the code-generated deserialization code - - See not on `entity_to_dict()` about api_client argument. - """ - if not api_client: - api_client = ApiClient() - thing = collections.namedtuple('Thing', ['data']) - thing.data = json_str - return api_client.deserialize(thing, entity_type) def check_kbart(year, archive): if not archive or not archive.get('year_spans'): @@ -319,171 +294,6 @@ def container_to_elasticsearch(entity, force_bool=True): return t -def contribs_by_role(contribs, role): - ret = [c.copy() for c in contribs if c['role'] == role] - [c.pop('role') for c in ret] - # XXX: - [c.pop('literal') for c in ret] - if not ret: - return None - else: - return ret - - -def release_to_csl(entity): - """ - Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, - citeproc-JSON, aka Citation Style Language JSON) - - This function will likely become an API method/endpoint - - Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json - """ - contribs = [] - for contrib in (entity.contribs or []): - if contrib.creator: - # TODO: should we actually be pulling creator metadata? or just - # using release-local raw metadata? - c = dict( - family=contrib.creator.surname, - given=contrib.creator.given_name, - #dropping-particle - #non-dropping-particle - #suffix - #comma-suffix - #static-ordering - literal=contrib.raw_name, # or display_name? - #parse-names, - role=contrib.role, - ) - else: - c = dict( - # XXX: possible inclusion of full name metadata in release_contrib - family=contrib.raw_name.split()[-1], - literal=contrib.raw_name, - role=contrib.role, - ) - for k in list(c.keys()): - if not c[k]: - c.pop(k) - contribs.append(c) - abstract = None - if entity.abstracts: - abstract = entity.abstracts[0].content - - issued_date = None - if entity.release_date: - issued_date = {"date-parts": [[ - entity.release_date.year, - entity.release_date.month, - entity.release_date.day, - ]]} - elif entity.release_year: - issued_date = {"date-parts": [[entity.release_year]]} - - csl = dict( - #id, - #categories - type=entity.release_type or "article", # XXX: can't be blank - language=entity.language, - #journalAbbreviation - #shortTitle - ## see below for all contrib roles - #accessed - #container - #event-date - issued=issued_date, - #original-date - #submitted - abstract=abstract, - #annote - #archive - #archive_location - #archive-place - #authority - #call-number - #chapter-number - #citation-number - #citation-label - #collection-number - #collection-title - container_title=entity.container and entity.container.name, - #container-title-short - #dimensions - DOI=entity.doi, - #edition - #event - #event-place - #first-reference-note-number - #genre - ISBN=entity.isbn13, - ISSN=entity.container and entity.container.issnl, - issue=entity.issue, - #jurisdiction - #keyword - #locator - #medium - #note - #number - #number-of-pages - #number-of-volumes - #original-publisher - #original-publisher-place - #original-title - # XXX: page=entity.pages, - page_first=entity.pages.split('-')[0], - PMCID=entity.pmcid, - PMID=entity.pmid, - publisher=(entity.container and entity.container.publisher) or entity.publisher, - #publisher-place - #references - #reviewed-title - #scale - #section - #source - #status - title=entity.title, - #title-short - #URL - #version - volume=entity.volume, - #year-suffix - ) - for role in ['author', 'collection-editor', 'composer', 'container-author', - 'director', 'editor', 'editorial-director', 'interviewer', - 'illustrator', 'original-author', 'recipient', 'reviewed-author', - 'translator']: - cbr = contribs_by_role(contribs, role) - if cbr: - csl[role] = cbr - # underline-to-dash - csl['container-title'] = csl.pop('container_title') - csl['page-first'] = csl.pop('page_first') - empty_keys = [k for k,v in csl.items() if not v] - for k in empty_keys: - csl.pop(k) - return csl - - -def refs_to_csl(entity): - ret = [] - for ref in entity.refs: - if ref.release_id and False: - # TODO: fetch full entity from API and convert with release_to_csl - raise NotImplementedError - else: - issued_date = None - if ref.year: - issued_date = [[ref.year]] - csl = dict( - title=ref.title, - issued=issued_date, - ) - csl['id'] = ref.key or ref.index, # zero- or one-indexed? - ret.append(csl) - return ret - - def changelog_to_elasticsearch(entity): editgroup = entity.editgroup diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py new file mode 100644 index 00000000..b67df12d --- /dev/null +++ b/python/fatcat_tools/transforms/entities.py @@ -0,0 +1,31 @@ + + +import collections +from fatcat_client import ApiClient + +def entity_to_dict(entity, api_client=None): + """ + Hack to take advantage of the code-generated serialization code. + + Initializing/destroying ApiClient objects is surprisingly expensive + (because it involves a threadpool), so we allow passing an existing + instance. If you already have a full-on API connection `api`, you can + access the ApiClient object as `api.api_client`. This is such a speed-up + that this argument may become mandatory. + """ + if not api_client: + api_client = ApiClient() + return api_client.sanitize_for_serialization(entity) + +def entity_from_json(json_str, entity_type, api_client=None): + """ + Hack to take advantage of the code-generated deserialization code + + See not on `entity_to_dict()` about api_client argument. + """ + if not api_client: + api_client = ApiClient() + thing = collections.namedtuple('Thing', ['data']) + thing.data = json_str + return api_client.deserialize(thing, entity_type) + diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py new file mode 100755 index 00000000..8d5c34c5 --- /dev/null +++ b/python/fatcat_transform.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +""" +""" + +import sys +import json +import argparse + +from citeproc import CitationStylesStyle, CitationStylesBibliography +from citeproc import Citation, CitationItem +from citeproc import formatter +from citeproc.source.json import CiteProcJSON +from citeproc_styles import get_style_filepath + +import fatcat_client +from fatcat_client.rest import ApiException +from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry +from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ + release_to_elasticsearch, container_to_elasticsearch, \ + changelog_to_elasticsearch, public_api, release_to_csl + + +def run_transform_releases(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) + args.json_output.write( + json.dumps(release_to_elasticsearch(entity)) + '\n') + +def run_transform_containers(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client) + args.json_output.write( + json.dumps(container_to_elasticsearch(entity)) + '\n') + +def run_transform_changelogs(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) + args.json_output.write( + json.dumps(changelog_to_elasticsearch(entity)) + '\n') + +def run_citeproc_releases(args): + for line in args.json_input: + line = line.strip() + if not line: + continue + entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) + csl_json = release_to_csl(entity) + # XXX: + csl_json['id'] = "release:" + (entity.ident or "unknown") + if args.style == "csl-json": + args.json_output.write(json.dumps(csl_json) + "\n") + continue + bib_src = CiteProcJSON([csl_json]) + form = formatter.plain + if args.html: + form = formatter.html + style_path = get_style_filepath(args.style) + bib_style = CitationStylesStyle(style_path, validate=False) + bib = CitationStylesBibliography(bib_style, bib_src, form) + bib.register(Citation([CitationItem(csl_json['id'])])) + # XXX: + #args.json_output.write( + # json.dumps(release_to_csl(entity)) + '\n') + lines = bib.bibliography()[0] + if args.style == "bibtex": + for l in lines: + if l.startswith(" @"): + args.json_output.write("\n@") + elif l.startswith(" "): + #print("line: START|{}|END".format(l)) + args.json_output.write("\n " + l) + else: + args.json_output.write(l) + else: + args.json_output.write(''.join(lines) + "\n") + print() + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--debug', + action='store_true', + help="enable debugging interface") + parser.add_argument('--host-url', + default="http://localhost:9411/v0", + help="connect to this host/port") + subparsers = parser.add_subparsers() + + sub_transform_releases = subparsers.add_parser('transform-releases') + sub_transform_releases.set_defaults(func=run_transform_releases) + sub_transform_releases.add_argument('json_input', + help="JSON-per-line of release entities", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_releases.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + + sub_transform_containers = subparsers.add_parser('transform-containers') + sub_transform_containers.set_defaults(func=run_transform_containers) + sub_transform_containers.add_argument('json_input', + help="JSON-per-line of container entities", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_containers.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + + sub_transform_changelogs = subparsers.add_parser('transform-changelogs') + sub_transform_changelogs.set_defaults(func=run_transform_changelogs) + sub_transform_changelogs.add_argument('json_input', + help="JSON-per-line of changelog entries", + default=sys.stdin, type=argparse.FileType('r')) + sub_transform_changelogs.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + + sub_citeproc_releases = subparsers.add_parser('citeproc-releases') + sub_citeproc_releases.set_defaults(func=run_citeproc_releases) + sub_citeproc_releases.add_argument('json_input', + help="JSON-per-line of release entities", + default=sys.stdin, type=argparse.FileType('r')) + sub_citeproc_releases.add_argument('json_output', + help="where to send output", + default=sys.stdout, type=argparse.FileType('w')) + sub_citeproc_releases.add_argument('--style', + help="citation style to output", + default='csl-json') + sub_citeproc_releases.add_argument('--html', + action='store_true', + help="output HTML, not plain text") + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do!") + sys.exit(-1) + + args.api = public_api(args.host_url) + args.func(args) + +if __name__ == '__main__': + main() |