aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xpython/fatcat_export.py115
-rw-r--r--python/fatcat_tools/__init__.py4
-rw-r--r--python/fatcat_tools/transforms/__init__.py4
-rw-r--r--python/fatcat_tools/transforms/csl.py170
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py (renamed from python/fatcat_tools/transforms.py)190
-rw-r--r--python/fatcat_tools/transforms/entities.py31
-rwxr-xr-xpython/fatcat_transform.py149
7 files changed, 356 insertions, 307 deletions
diff --git a/python/fatcat_export.py b/python/fatcat_export.py
index a9d46142..e3c141fd 100755
--- a/python/fatcat_export.py
+++ b/python/fatcat_export.py
@@ -11,18 +11,11 @@ import sys
import json
import argparse
-from citeproc import CitationStylesStyle, CitationStylesBibliography
-from citeproc import Citation, CitationItem
-from citeproc import formatter
-from citeproc.source.json import CiteProcJSON
-from citeproc_styles import get_style_filepath
-
import fatcat_client
from fatcat_client.rest import ApiException
from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry
from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \
- release_to_elasticsearch, container_to_elasticsearch, \
- changelog_to_elasticsearch, public_api, release_to_csl
+ public_api
def run_export_releases(args):
@@ -32,70 +25,6 @@ def run_export_releases(args):
args.json_output.write(
json.dumps(entity_to_dict(release), api_client=args.api.api_client) + "\n")
-def run_transform_releases(args):
- for line in args.json_input:
- line = line.strip()
- if not line:
- continue
- entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
- args.json_output.write(
- json.dumps(release_to_elasticsearch(entity)) + '\n')
-
-def run_transform_containers(args):
- for line in args.json_input:
- line = line.strip()
- if not line:
- continue
- entity = entity_from_json(line, ContainerEntity, api_client=args.api.api_client)
- args.json_output.write(
- json.dumps(container_to_elasticsearch(entity)) + '\n')
-
-def run_transform_changelogs(args):
- for line in args.json_input:
- line = line.strip()
- if not line:
- continue
- entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client)
- args.json_output.write(
- json.dumps(changelog_to_elasticsearch(entity)) + '\n')
-
-def run_citeproc_releases(args):
- for line in args.json_input:
- line = line.strip()
- if not line:
- continue
- entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
- csl_json = release_to_csl(entity)
- # XXX:
- csl_json['id'] = "release:" + (entity.ident or "unknown")
- if args.style == "csl-json":
- args.json_output.write(json.dumps(csl_json) + "\n")
- continue
- bib_src = CiteProcJSON([csl_json])
- form = formatter.plain
- if args.html:
- form = formatter.html
- style_path = get_style_filepath(args.style)
- bib_style = CitationStylesStyle(style_path, validate=False)
- bib = CitationStylesBibliography(bib_style, bib_src, form)
- bib.register(Citation([CitationItem(csl_json['id'])]))
- # XXX:
- #args.json_output.write(
- # json.dumps(release_to_csl(entity)) + '\n')
- lines = bib.bibliography()[0]
- if args.style == "bibtex":
- for l in lines:
- if l.startswith(" @"):
- args.json_output.write("\n@")
- elif l.startswith(" "):
- #print("line: START|{}|END".format(l))
- args.json_output.write("\n " + l)
- else:
- args.json_output.write(l)
- else:
- args.json_output.write(''.join(lines) + "\n")
- print()
-
def run_export_changelog(args):
end = args.end
if end is None:
@@ -126,48 +55,6 @@ def main():
help="where to send output",
default=sys.stdout, type=argparse.FileType('w'))
- sub_transform_releases = subparsers.add_parser('transform-releases')
- sub_transform_releases.set_defaults(func=run_transform_releases)
- sub_transform_releases.add_argument('json_input',
- help="JSON-per-line of release entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_transform_releases.add_argument('json_output',
- help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
-
- sub_transform_containers = subparsers.add_parser('transform-containers')
- sub_transform_containers.set_defaults(func=run_transform_containers)
- sub_transform_containers.add_argument('json_input',
- help="JSON-per-line of container entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_transform_containers.add_argument('json_output',
- help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
-
- sub_transform_changelogs = subparsers.add_parser('transform-changelogs')
- sub_transform_changelogs.set_defaults(func=run_transform_changelogs)
- sub_transform_changelogs.add_argument('json_input',
- help="JSON-per-line of changelog entries",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_transform_changelogs.add_argument('json_output',
- help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
-
- sub_citeproc_releases = subparsers.add_parser('citeproc-releases')
- sub_citeproc_releases.set_defaults(func=run_citeproc_releases)
- sub_citeproc_releases.add_argument('json_input',
- help="JSON-per-line of release entities",
- default=sys.stdin, type=argparse.FileType('r'))
- sub_citeproc_releases.add_argument('json_output',
- help="where to send output",
- default=sys.stdout, type=argparse.FileType('w'))
- sub_citeproc_releases.add_argument('--style',
- help="citation style to output",
- default='csl-json')
- sub_citeproc_releases.add_argument('--html',
- action='store_true',
- help="output HTML, not plain text")
-
sub_changelog = subparsers.add_parser('changelog')
sub_changelog.set_defaults(func=run_export_changelog)
sub_changelog.add_argument('--start',
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index c72ccd47..f2798f0b 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,6 +1,4 @@
from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_dict, entity_from_json, \
- release_to_elasticsearch, container_to_elasticsearch, \
- changelog_to_elasticsearch, release_to_csl
+from .transforms import *
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
new file mode 100644
index 00000000..4950433b
--- /dev/null
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -0,0 +1,4 @@
+
+from .entities import entity_to_dict, entity_from_json
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .csl import release_to_csl
diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py
new file mode 100644
index 00000000..f9615b26
--- /dev/null
+++ b/python/fatcat_tools/transforms/csl.py
@@ -0,0 +1,170 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+
+def contribs_by_role(contribs, role):
+    """
+    Selects the contributors having the given role from a list of contrib
+    dicts (as built inside release_to_csl()).
+
+    Returns copies of the matching dicts with the 'role' and 'literal' keys
+    removed, or None if no contributor matches. The input dicts are not
+    modified.
+    """
+    ret = []
+    for c in contribs:
+        # release_to_csl() strips empty values before calling this, so the
+        # 'role' and 'literal' keys may be absent; don't KeyError on those
+        if c.get('role') != role:
+            continue
+        # XXX: 'literal' is dropped along with 'role'
+        ret.append({k: v for k, v in c.items() if k not in ('role', 'literal')})
+    return ret or None
+
+
+def release_to_csl(entity):
+    """
+    Returns a python dict which can be json.dumps() to get a CSL-JSON (aka,
+    citeproc-JSON, aka Citation Style Language JSON)
+
+    This function will likely become an API method/endpoint
+
+    Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
+
+    Keys whose value is empty are left out of the returned dict. Releases
+    without page info, and contribs without a usable raw_name, are tolerated.
+    """
+    contribs = []
+    for contrib in (entity.contribs or []):
+        if contrib.creator:
+            # TODO: should we actually be pulling creator metadata? or just
+            # using release-local raw metadata?
+            c = dict(
+                family=contrib.creator.surname,
+                given=contrib.creator.given_name,
+                #dropping-particle
+                #non-dropping-particle
+                #suffix
+                #comma-suffix
+                #static-ordering
+                literal=contrib.raw_name, # or display_name?
+                #parse-names,
+                role=contrib.role,
+            )
+        else:
+            # raw_name may be missing or blank; only guess a family name when
+            # there is at least one whitespace-separated token
+            name_parts = (contrib.raw_name or '').split()
+            c = dict(
+                # XXX: possible inclusion of full name metadata in release_contrib
+                family=name_parts[-1] if name_parts else None,
+                literal=contrib.raw_name,
+                role=contrib.role,
+            )
+        for k in list(c.keys()):
+            if not c[k]:
+                c.pop(k)
+        contribs.append(c)
+    abstract = None
+    if entity.abstracts:
+        abstract = entity.abstracts[0].content
+
+    issued_date = None
+    if entity.release_date:
+        issued_date = {"date-parts": [[
+            entity.release_date.year,
+            entity.release_date.month,
+            entity.release_date.day,
+        ]]}
+    elif entity.release_year:
+        issued_date = {"date-parts": [[entity.release_year]]}
+
+    csl = dict(
+        #id,
+        #categories
+        type=entity.release_type or "article", # XXX: can't be blank
+        language=entity.language,
+        #journalAbbreviation
+        #shortTitle
+        ## see below for all contrib roles
+        #accessed
+        #container
+        #event-date
+        issued=issued_date,
+        #original-date
+        #submitted
+        abstract=abstract,
+        #annote
+        #archive
+        #archive_location
+        #archive-place
+        #authority
+        #call-number
+        #chapter-number
+        #citation-number
+        #citation-label
+        #collection-number
+        #collection-title
+        container_title=entity.container and entity.container.name,
+        #container-title-short
+        #dimensions
+        DOI=entity.doi,
+        #edition
+        #event
+        #event-place
+        #first-reference-note-number
+        #genre
+        ISBN=entity.isbn13,
+        ISSN=entity.container and entity.container.issnl,
+        issue=entity.issue,
+        #jurisdiction
+        #keyword
+        #locator
+        #medium
+        #note
+        #number
+        #number-of-pages
+        #number-of-volumes
+        #original-publisher
+        #original-publisher-place
+        #original-title
+        # XXX: page=entity.pages,
+        # pages may be None; an empty first page is dropped with the other
+        # empty keys below
+        page_first=(entity.pages or '').split('-')[0],
+        PMCID=entity.pmcid,
+        PMID=entity.pmid,
+        publisher=(entity.container and entity.container.publisher) or entity.publisher,
+        #publisher-place
+        #references
+        #reviewed-title
+        #scale
+        #section
+        #source
+        #status
+        title=entity.title,
+        #title-short
+        #URL
+        #version
+        volume=entity.volume,
+        #year-suffix
+    )
+    for role in ['author', 'collection-editor', 'composer', 'container-author',
+                 'director', 'editor', 'editorial-director', 'interviewer',
+                 'illustrator', 'original-author', 'recipient', 'reviewed-author',
+                 'translator']:
+        cbr = contribs_by_role(contribs, role)
+        if cbr:
+            csl[role] = cbr
+    # underline-to-dash
+    csl['container-title'] = csl.pop('container_title')
+    csl['page-first'] = csl.pop('page_first')
+    # drop every key that ended up without a value
+    return {k: v for k, v in csl.items() if v}
+
+
+def refs_to_csl(entity):
+    """
+    Converts the references of a release entity into a list of minimal
+    CSL-JSON dicts; only 'title', 'issued', and 'id' are populated.
+
+    Returns an empty list if the entity has no refs.
+    """
+    ret = []
+    for ref in (entity.refs or []):
+        # TODO: when ref.release_id is set, fetch full entity from API and
+        # convert with release_to_csl
+        issued_date = None
+        if ref.year:
+            # same "date-parts" shape that release_to_csl() emits
+            issued_date = {"date-parts": [[ref.year]]}
+        csl = dict(
+            title=ref.title,
+            issued=issued_date,
+        )
+        # NOTE: no trailing comma here; it would turn the id into a 1-tuple
+        csl['id'] = ref.key or ref.index # zero- or one-indexed?
+        ret.append(csl)
+    return ret
+
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms/elasticsearch.py
index f49b5ac9..0c2c5e46 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -3,31 +3,6 @@
import collections
from fatcat_client import ApiClient
-def entity_to_dict(entity, api_client=None):
- """
- Hack to take advantage of the code-generated serialization code.
-
- Initializing/destroying ApiClient objects is surprisingly expensive
- (because it involves a threadpool), so we allow passing an existing
- instance. If you already have a full-on API connection `api`, you can
- access the ApiClient object as `api.api_client`. This is such a speed-up
- that this argument may become mandatory.
- """
- if not api_client:
- api_client = ApiClient()
- return api_client.sanitize_for_serialization(entity)
-
-def entity_from_json(json_str, entity_type, api_client=None):
- """
- Hack to take advantage of the code-generated deserialization code
-
- See not on `entity_to_dict()` about api_client argument.
- """
- if not api_client:
- api_client = ApiClient()
- thing = collections.namedtuple('Thing', ['data'])
- thing.data = json_str
- return api_client.deserialize(thing, entity_type)
def check_kbart(year, archive):
if not archive or not archive.get('year_spans'):
@@ -319,171 +294,6 @@ def container_to_elasticsearch(entity, force_bool=True):
return t
-def contribs_by_role(contribs, role):
- ret = [c.copy() for c in contribs if c['role'] == role]
- [c.pop('role') for c in ret]
- # XXX:
- [c.pop('literal') for c in ret]
- if not ret:
- return None
- else:
- return ret
-
-
-def release_to_csl(entity):
- """
- Returns a python dict which can be json.dumps() to get a CSL-JSON (aka,
- citeproc-JSON, aka Citation Style Language JSON)
-
- This function will likely become an API method/endpoint
-
- Follows, but not enforced by: https://github.com/citation-style-language/schema/blob/master/csl-data.json
- """
- contribs = []
- for contrib in (entity.contribs or []):
- if contrib.creator:
- # TODO: should we actually be pulling creator metadata? or just
- # using release-local raw metadata?
- c = dict(
- family=contrib.creator.surname,
- given=contrib.creator.given_name,
- #dropping-particle
- #non-dropping-particle
- #suffix
- #comma-suffix
- #static-ordering
- literal=contrib.raw_name, # or display_name?
- #parse-names,
- role=contrib.role,
- )
- else:
- c = dict(
- # XXX: possible inclusion of full name metadata in release_contrib
- family=contrib.raw_name.split()[-1],
- literal=contrib.raw_name,
- role=contrib.role,
- )
- for k in list(c.keys()):
- if not c[k]:
- c.pop(k)
- contribs.append(c)
- abstract = None
- if entity.abstracts:
- abstract = entity.abstracts[0].content
-
- issued_date = None
- if entity.release_date:
- issued_date = {"date-parts": [[
- entity.release_date.year,
- entity.release_date.month,
- entity.release_date.day,
- ]]}
- elif entity.release_year:
- issued_date = {"date-parts": [[entity.release_year]]}
-
- csl = dict(
- #id,
- #categories
- type=entity.release_type or "article", # XXX: can't be blank
- language=entity.language,
- #journalAbbreviation
- #shortTitle
- ## see below for all contrib roles
- #accessed
- #container
- #event-date
- issued=issued_date,
- #original-date
- #submitted
- abstract=abstract,
- #annote
- #archive
- #archive_location
- #archive-place
- #authority
- #call-number
- #chapter-number
- #citation-number
- #citation-label
- #collection-number
- #collection-title
- container_title=entity.container and entity.container.name,
- #container-title-short
- #dimensions
- DOI=entity.doi,
- #edition
- #event
- #event-place
- #first-reference-note-number
- #genre
- ISBN=entity.isbn13,
- ISSN=entity.container and entity.container.issnl,
- issue=entity.issue,
- #jurisdiction
- #keyword
- #locator
- #medium
- #note
- #number
- #number-of-pages
- #number-of-volumes
- #original-publisher
- #original-publisher-place
- #original-title
- # XXX: page=entity.pages,
- page_first=entity.pages.split('-')[0],
- PMCID=entity.pmcid,
- PMID=entity.pmid,
- publisher=(entity.container and entity.container.publisher) or entity.publisher,
- #publisher-place
- #references
- #reviewed-title
- #scale
- #section
- #source
- #status
- title=entity.title,
- #title-short
- #URL
- #version
- volume=entity.volume,
- #year-suffix
- )
- for role in ['author', 'collection-editor', 'composer', 'container-author',
- 'director', 'editor', 'editorial-director', 'interviewer',
- 'illustrator', 'original-author', 'recipient', 'reviewed-author',
- 'translator']:
- cbr = contribs_by_role(contribs, role)
- if cbr:
- csl[role] = cbr
- # underline-to-dash
- csl['container-title'] = csl.pop('container_title')
- csl['page-first'] = csl.pop('page_first')
- empty_keys = [k for k,v in csl.items() if not v]
- for k in empty_keys:
- csl.pop(k)
- return csl
-
-
-def refs_to_csl(entity):
- ret = []
- for ref in entity.refs:
- if ref.release_id and False:
- # TODO: fetch full entity from API and convert with release_to_csl
- raise NotImplementedError
- else:
- issued_date = None
- if ref.year:
- issued_date = [[ref.year]]
- csl = dict(
- title=ref.title,
- issued=issued_date,
- )
- csl['id'] = ref.key or ref.index, # zero- or one-indexed?
- ret.append(csl)
- return ret
-
-
def changelog_to_elasticsearch(entity):
editgroup = entity.editgroup
diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py
new file mode 100644
index 00000000..b67df12d
--- /dev/null
+++ b/python/fatcat_tools/transforms/entities.py
@@ -0,0 +1,31 @@
+
+
+import collections
+from fatcat_client import ApiClient
+
+def entity_to_dict(entity, api_client=None):
+    """
+    Turns an entity object into plain python data by borrowing the
+    serialization routine of the code-generated client (a hack).
+
+    Setting up and tearing down an ApiClient is surprisingly expensive
+    (a threadpool is involved), so an existing instance can be passed in.
+    With a full API connection `api` at hand, its ApiClient is available as
+    `api.api_client`. The speed-up is large enough that this argument may
+    become mandatory.
+    """
+    client = api_client or ApiClient()
+    return client.sanitize_for_serialization(entity)
+
+def entity_from_json(json_str, entity_type, api_client=None):
+    """
+    Hack to take advantage of the code-generated deserialization code
+
+    See note on `entity_to_dict()` about api_client argument.
+    """
+    if not api_client:
+        api_client = ApiClient()
+    # deserialize() is handed a stand-in object carrying the JSON text in
+    # its `data` attribute (presumably mimicking an HTTP response). Build a
+    # real instance rather than assigning onto the namedtuple class itself.
+    thing = collections.namedtuple('Thing', ['data'])(data=json_str)
+    return api_client.deserialize(thing, entity_type)
+
diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py
new file mode 100755
index 00000000..8d5c34c5
--- /dev/null
+++ b/python/fatcat_transform.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+
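+"""
+Command-line tool which transforms fatcat entities and changelog entries
+(one JSON object per line) into elasticsearch documents or citeproc/CSL
+citations.
+"""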
+
+import sys
+import json
+import argparse
+
+from citeproc import CitationStylesStyle, CitationStylesBibliography
+from citeproc import Citation, CitationItem
+from citeproc import formatter
+from citeproc.source.json import CiteProcJSON
+from citeproc_styles import get_style_filepath
+
+import fatcat_client
+from fatcat_client.rest import ApiException
+from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry
+from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \
+ release_to_elasticsearch, container_to_elasticsearch, \
+ changelog_to_elasticsearch, public_api, release_to_csl
+
+
+def run_transform_releases(args):
+    """
+    Reads release entities, one JSON object per line, from args.json_input
+    and writes the elasticsearch form of each to args.json_output as a single
+    JSON line. Blank input lines are skipped.
+    """
+    for raw in args.json_input:
+        record = raw.strip()
+        if record:
+            release = entity_from_json(record, ReleaseEntity, api_client=args.api.api_client)
+            es_doc = release_to_elasticsearch(release)
+            args.json_output.write(json.dumps(es_doc) + '\n')
+
+def run_transform_containers(args):
+    """
+    Reads container entities, one JSON object per line, from args.json_input
+    and writes the elasticsearch form of each to args.json_output as a single
+    JSON line. Blank input lines are skipped.
+    """
+    for raw in args.json_input:
+        record = raw.strip()
+        if record:
+            container = entity_from_json(record, ContainerEntity, api_client=args.api.api_client)
+            es_doc = container_to_elasticsearch(container)
+            args.json_output.write(json.dumps(es_doc) + '\n')
+
+def run_transform_changelogs(args):
+    """
+    Reads changelog entries, one JSON object per line, from args.json_input
+    and writes the elasticsearch form of each to args.json_output as a single
+    JSON line. Blank input lines are skipped.
+    """
+    for raw in args.json_input:
+        record = raw.strip()
+        if record:
+            entry = entity_from_json(record, ChangelogEntry, api_client=args.api.api_client)
+            es_doc = changelog_to_elasticsearch(entry)
+            args.json_output.write(json.dumps(es_doc) + '\n')
+
+def run_citeproc_releases(args):
+    """
+    Reads release entities, one JSON object per line, from args.json_input
+    and writes a citation for each to args.json_output.
+
+    With args.style == "csl-json" the CSL-JSON dict itself is written as one
+    JSON line. Any other style is rendered through citeproc, as HTML when
+    args.html is set and as plain text otherwise. Blank input lines are
+    skipped.
+    """
+    for line in args.json_input:
+        line = line.strip()
+        if not line:
+            continue
+        entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client)
+        csl_json = release_to_csl(entity)
+        # XXX:
+        csl_json['id'] = "release:" + (entity.ident or "unknown")
+        if args.style == "csl-json":
+            args.json_output.write(json.dumps(csl_json) + "\n")
+            continue
+        bib_src = CiteProcJSON([csl_json])
+        form = formatter.html if args.html else formatter.plain
+        style_path = get_style_filepath(args.style)
+        bib_style = CitationStylesStyle(style_path, validate=False)
+        bib = CitationStylesBibliography(bib_style, bib_src, form)
+        bib.register(Citation([CitationItem(csl_json['id'])]))
+        lines = bib.bibliography()[0]
+        if args.style == "bibtex":
+            for chunk in lines:
+                if chunk.startswith(" @"):
+                    args.json_output.write("\n@")
+                elif chunk.startswith(" "):
+                    args.json_output.write("\n " + chunk)
+                else:
+                    args.json_output.write(chunk)
+        else:
+            args.json_output.write(''.join(lines) + "\n")
+        # separate entries in the output stream itself, not on stdout, so
+        # the separator also lands in the file when output is a file
+        args.json_output.write("\n")
+
+def _add_io_arguments(subparser, input_help):
+    """Adds the optional json_input / json_output positionals to a subparser."""
+    # nargs='?' makes the positionals optional; without it argparse requires
+    # them and the stdin/stdout defaults would never be used
+    subparser.add_argument('json_input',
+        nargs='?',
+        help=input_help,
+        default=sys.stdin, type=argparse.FileType('r'))
+    subparser.add_argument('json_output',
+        nargs='?',
+        help="where to send output",
+        default=sys.stdout, type=argparse.FileType('w'))
+
+def main():
+    """
+    Command-line entry point: parses arguments, creates the API client for
+    --host-url, and runs the function bound to the chosen sub-command.
+
+    Exits with status -1 when no sub-command is given.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--debug',
+        action='store_true',
+        help="enable debugging interface")
+    parser.add_argument('--host-url',
+        default="http://localhost:9411/v0",
+        help="connect to this host/port")
+    subparsers = parser.add_subparsers()
+
+    sub_transform_releases = subparsers.add_parser('transform-releases')
+    sub_transform_releases.set_defaults(func=run_transform_releases)
+    _add_io_arguments(sub_transform_releases, "JSON-per-line of release entities")
+
+    sub_transform_containers = subparsers.add_parser('transform-containers')
+    sub_transform_containers.set_defaults(func=run_transform_containers)
+    _add_io_arguments(sub_transform_containers, "JSON-per-line of container entities")
+
+    sub_transform_changelogs = subparsers.add_parser('transform-changelogs')
+    sub_transform_changelogs.set_defaults(func=run_transform_changelogs)
+    _add_io_arguments(sub_transform_changelogs, "JSON-per-line of changelog entries")
+
+    sub_citeproc_releases = subparsers.add_parser('citeproc-releases')
+    sub_citeproc_releases.set_defaults(func=run_citeproc_releases)
+    _add_io_arguments(sub_citeproc_releases, "JSON-per-line of release entities")
+    sub_citeproc_releases.add_argument('--style',
+        help="citation style to output",
+        default='csl-json')
+    sub_citeproc_releases.add_argument('--html',
+        action='store_true',
+        help="output HTML, not plain text")
+
+    args = parser.parse_args()
+    # 'func' is only set when a sub-command was selected
+    if not getattr(args, 'func', None):
+        print("tell me what to do!")
+        sys.exit(-1)
+
+    args.api = public_api(args.host_url)
+    args.func(args)
+
+if __name__ == '__main__':
+ main()