From f10bcb49d17234dc52c8b67a7b7fd1796ab6f435 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 20 Sep 2018 12:40:12 -0700 Subject: work in progress on guide (mdbook) --- guide/.gitignore | 1 + guide/README.md | 8 ++++ guide/book.toml | 5 ++ guide/push_qa.sh | 3 ++ guide/src/SUMMARY.md | 14 ++++++ guide/src/bulk_exports.md | 1 + guide/src/cookbook.md | 37 +++++++++++++++ guide/src/data_model.md | 1 + guide/src/entity_fields.md | 1 + guide/src/http_api.md | 56 +++++++++++++++++++++++ guide/src/implementation.md | 1 + guide/src/overview.md | 3 ++ guide/src/policies.md | 102 +++++++++++++++++++++++++++++++++++++++++ guide/src/rfc.md | 1 + guide/src/sources.md | 1 + guide/src/style_guide.md | 109 ++++++++++++++++++++++++++++++++++++++++++++ guide/src/sw_contribute.md | 14 ++++++ 17 files changed, 358 insertions(+) create mode 100644 guide/.gitignore create mode 100644 guide/README.md create mode 100644 guide/book.toml create mode 100755 guide/push_qa.sh create mode 100644 guide/src/SUMMARY.md create mode 100644 guide/src/bulk_exports.md create mode 100644 guide/src/cookbook.md create mode 100644 guide/src/data_model.md create mode 100644 guide/src/entity_fields.md create mode 100644 guide/src/http_api.md create mode 100644 guide/src/implementation.md create mode 100644 guide/src/overview.md create mode 100644 guide/src/policies.md create mode 120000 guide/src/rfc.md create mode 100644 guide/src/sources.md create mode 100644 guide/src/style_guide.md create mode 100644 guide/src/sw_contribute.md diff --git a/guide/.gitignore b/guide/.gitignore new file mode 100644 index 00000000..7585238e --- /dev/null +++ b/guide/.gitignore @@ -0,0 +1 @@ +book diff --git a/guide/README.md b/guide/README.md new file mode 100644 index 00000000..a8916ffb --- /dev/null +++ b/guide/README.md @@ -0,0 +1,8 @@ + +This is an [mdBook](https://rust-lang-nursery.github.io/mdBook/index.html), +containing documentation for the fatcat bibliographic catalog, including: + +- contributor style 
guide (for bibliographic metadata) +- developer (API) documentation +- etc. + diff --git a/guide/book.toml b/guide/book.toml new file mode 100644 index 00000000..b738a619 --- /dev/null +++ b/guide/book.toml @@ -0,0 +1,5 @@ +[book] +title = "Fatcat: The Guide" +authors = ["Fatcat Documentation Contributors"] +multilingual = false +src = "src" diff --git a/guide/push_qa.sh b/guide/push_qa.sh new file mode 100755 index 00000000..ffdc41bb --- /dev/null +++ b/guide/push_qa.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +rsync -arv book/ fatcat-vm:/srv/fatcat/guide diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md new file mode 100644 index 00000000..6a23f7fe --- /dev/null +++ b/guide/src/SUMMARY.md @@ -0,0 +1,14 @@ +# Outline + +- [Fatcat Overview](./overview.md) + - [Data Model](./data_model.md) + - [Sources](./sources.md) + - [Implementation](./implementation.md) + - [Original Design Document](./rfc.md) +- [Cataloging Style Guide](./style_guide.md) + - [Entity Field Reference](./entity_fields.md) +- [Public API](./http_api.md) + - [Bulk Exports](./bulk_exports.md) + - [Cookbook](./cookbook.md) +- [Software Contributions](./sw_contribute.md) +- [Policies](./policies.md) diff --git a/guide/src/bulk_exports.md b/guide/src/bulk_exports.md new file mode 100644 index 00000000..aaf236f2 --- /dev/null +++ b/guide/src/bulk_exports.md @@ -0,0 +1 @@ +# Bulk Exports diff --git a/guide/src/cookbook.md b/guide/src/cookbook.md new file mode 100644 index 00000000..74bffe59 --- /dev/null +++ b/guide/src/cookbook.md @@ -0,0 +1,37 @@ +# Cookbook + +### Updating an Existing Entity + +1. Fetch (GET) the existing entity +2. Create (POST) a new editgroup +3. Update (PUT) the entity, with the current revision number in the `prev` edit + field, and the editgroup id set +4. Submit (POST? TBD) the editgroup for review + +### Merging Duplicate Entities + +1. Fetch (GET) both entities +2. Decide which will be the "primary" entity (the other will redirect to it) +3. Create (POST) a new editgroup +4. 
Update (PUT) the "primary" entity with any updated metadata merged from the + other entity (optional), and the editgroup id set +5. Update (PUT) the "other" entity with the redirect flag set to the primary's + identifier, with the current revision id (of the "other" entity) in the + `prev` field, and the editgroup id set +6. Submit (POST? TBD) the editgroup for review + +### Lookup Fulltext URLs by DOI + +1. Use release lookup endpoint (GET) with the DOI as a query parameter, with + `expand=files` +2. If a release hit is found, iterate over the linked `file` entities, and + create a ranked list of URLs based on mimetype, URL "rel" type, file size, + or host domain. + +### Batch Insert New Entities (Bootstrapping) + +When bootstrapping a blank catalog, we need to insert 10s or 100s of millions +of entities as fast as possible. + +1. Create (POST) a new editgroup, with provenance information included +2. Batch create (POST) entities diff --git a/guide/src/data_model.md b/guide/src/data_model.md new file mode 100644 index 00000000..008c096a --- /dev/null +++ b/guide/src/data_model.md @@ -0,0 +1 @@ +# Data Model diff --git a/guide/src/entity_fields.md b/guide/src/entity_fields.md new file mode 100644 index 00000000..1a9e7bd4 --- /dev/null +++ b/guide/src/entity_fields.md @@ -0,0 +1 @@ +# Entity Field Reference diff --git a/guide/src/http_api.md b/guide/src/http_api.md new file mode 100644 index 00000000..5b38339f --- /dev/null +++ b/guide/src/http_api.md @@ -0,0 +1,56 @@ +# REST API + +The fatcat HTTP API is mostly a classic REST CRUD (Create, Read, Update, +Delete) API, with a few twists. + +A declarative specification of all API endpoints, JSON data models, and +response types is available in OpenAPI 2.0 format. Code generation tools are +used to generate both server-side type-safe endpoint routes and client-side +libraries. Auto-generated reference documentation is, for now, available at +. 
+ +All API traffic is over HTTPS; there is no insecure HTTP endpoint, even for +read-only operations. To start, all endpoints accept and return only JSON +serialized content. + +## Editgroups + +All mutating entity operations (create, update, delete) accept an +`editgroup_id` query parameter. If the parameter isn't set, the editor's +"currently active" editgroup will be used, or a new editgroup will be created +from scratch. It's generally preferable to manually create an editgroup and use +the `id` in edit requests; this allows appropriate metadata to be set. The +"currently active" editgroup behavior may be removed in the future. + +## Sub-Entity Expansion + +To reduce the need for multiple GET queries when looking for common related +metadata, it is possible to include linked entities in responses using the +`expand` query parameter. For example, by default the `release` model only +includes an optional `container_id` field which points to a container entity. +If the `expand` parameter is set: + + https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=container + +Then the full container model will be included under the `container` field. +Multiple expand parameters can be passed, comma-separated. + +## Authentication and Authorization + +There are two editor types: bots and humans. Additionally, either type of +editor may have additional privileges which allow them to, eg, directly accept +editgroups (as opposed to submitting edits for review). + +All mutating API calls (POST, PUT, DELETE HTTP verbs) require token-based +authentication using an HTTP Bearer token. If you can't generate such a token +from the web interface (because that feature hasn't been implemented), look for +a public demo token for experimentation, or ask an administrator for a token. 
+ +## QA Instance + +The intent is to run a public "sandbox" QA instance of the catalog, using a +subset of the full catalog, running the most recent development branch of the +API specification. This instance can be used by developers for prototyping and +experimentation, though note that all data is periodically wiped, and this +endpoint is more likely to have bugs or be offline. + diff --git a/guide/src/implementation.md b/guide/src/implementation.md new file mode 100644 index 00000000..d2557ff7 --- /dev/null +++ b/guide/src/implementation.md @@ -0,0 +1 @@ +# Implementation diff --git a/guide/src/overview.md b/guide/src/overview.md new file mode 100644 index 00000000..bc08ce1e --- /dev/null +++ b/guide/src/overview.md @@ -0,0 +1,3 @@ +# Fatcat Overview + +For now, see the [RFC](https://fatcat.wiki). diff --git a/guide/src/policies.md b/guide/src/policies.md new file mode 100644 index 00000000..18d84a36 --- /dev/null +++ b/guide/src/policies.md @@ -0,0 +1,102 @@ +# Norms and Policies + +These social norms are explicitly expected to evolve and mature if the number +of contributors to the project grows. It is important to have some policies as +a starting point, but also important not to set these policies in stone until +they have been reviewed. + +## Social Norms and Conduct + +Contributors (editors and software developers) are expected to treat each other +excellently, to assume good intentions, and to participate constructively. + +## Metadata Licensing + +The Fatcat catalog content license is the Creative Commons Zero ("CC-0") +license, which is effectively a public domain grant. This applies to the +catalog metadata itself (titles, entity relationships, citation metadata, URLs, +hashes, identifiers), as well as "meta-meta-data" provided by editors (edit +descriptions, provenance metadata, etc). 
+ +The core catalog is designed to contain only factual information: "this work, +known by this title and with these third-party identifiers, is believed to be +represented by these files and published under such-and-such venue". As a norm, +sourcing metadata (for attribution and provenance) is retained for each edit made +to the catalog. + +A notable exception to this policy is abstracts, for which no copyright claims +or license is made. Abstract content is kept separate from core catalog +metadata; downstream users need to make their own decision regarding reuse and +distribution of this material. + +As a social norm, it is expected (and appreciated!) that downstream users of +the public API and/or bulk exports provide attribution, and even transitive +attribution (acknowledging the original source of metadata contributed to +Fatcat). As an academic norm, researchers are encouraged to cite the corpus as +a dataset (when this option becomes available). However, neither of these norms +is enforced via the copyright mechanism. + +As a strong norm, editors should expect full access to the full corpus and edit +history, including all of their contributions. + +## Immutable History + +All editors agree to the licensing terms, and understand that their full public +history of contributions is made irrevocably public. Edits and contributions +may be *reverted*, but the history (and content) of their edits are retained. +Edit history is not removed from the corpus on the request of an editor or when +an editor closes their account. + +In an emergency situation, such as non-bibliographic content getting encoded in +the corpus by bypassing normal filters (eg, base64 encoding hate crime content +or exploitive photos, as has happened to some blockchain projects), the +ecosystem may decide to collectively, in a coordinated manner, expunge specific +records from their history. 
+ +## Documentation Licensing + +This guide ("Fatcat: The Guide") is licensed under the Creative Commons +Attribution license. + +## Software Licensing + +The Fatcat software project licensing policy is to adopt strong copyleft +licenses for server software (where the majority of software development takes +place), and permissive licenses for client library and bot framework software, +and CC-0 (public grant) licensing for declarative interface specifications +(such as SQL schemas and REST API specifications). + +## Privacy Policy + +*It is important to note that this section is currently aspirational: the +servers hosting early deployments of fatcat are largely in a default +configuration and have not been audited to ensure that these guidelines are +being followed.* + +It is a goal for fatcat to conduct as little surveillance of reader and editor +behavior and activities as possible. In practical terms, this means minimizing +the overall amount of logging and collection of identifying information. This +is in contrast to *submitted edit content*, which is captured, preserved, and +republished as widely as possible. + +The general intention is to: + +- not use third-party tracking (via extra browser-side requests or + javascript) +- collect aggregate *metrics* (overall hit numbers), but not *log* individual + interactions ("this IP visited this page at this time") + +Exceptions will likely be made: + +- temporary caching of IP addresses may be necessary to implement rate-limiting + and debug traffic spikes +- exception logging, abuse detection, and other exceptional circumstances + +Some uncertain areas of privacy include: + +- should third-party authentication identities be linked to editor ids? what + about the specific case of ORCiDs if used for login? +- what about discussion and comments on edits? should conversations be included + in full history dumps? should editors be allowed to update or remove + comments? 
+ diff --git a/guide/src/rfc.md b/guide/src/rfc.md new file mode 120000 index 00000000..25c420d2 --- /dev/null +++ b/guide/src/rfc.md @@ -0,0 +1 @@ +../../fatcat-rfc.md \ No newline at end of file diff --git a/guide/src/sources.md b/guide/src/sources.md new file mode 100644 index 00000000..e70306d4 --- /dev/null +++ b/guide/src/sources.md @@ -0,0 +1 @@ +# Sources diff --git a/guide/src/style_guide.md b/guide/src/style_guide.md new file mode 100644 index 00000000..944e68ce --- /dev/null +++ b/guide/src/style_guide.md @@ -0,0 +1,109 @@ +# Cataloging Style Guide + +## Language and Translation of Metadata + +The Fatcat data model does not include multiple titles or names for the same +entity, or even a "native"/"international" representation as seems common in +other bibliographic systems. This most notably applies to release titles, but +also to container and publisher names, and likely other fields. + +For now, editors must use their own judgement over whether to use the title of +the release listed in the work itself or a translation of it. + +This is not to be confused with *translations* of entire works, which should be +treated as an entirely separate `release`. + +## Work/Release Distinction + +## External Identifiers + +"Fake identifiers", which are actually registered and used in examples and +documentation (such as DOI `10.5555/12345678`) are allowed (and the entity +should be tagged as a fake or example). Non-registered "identifier-like +strings", which are semantically valid but not registered, should not exist in +fatcat metadata in an identifier column. Invalid identifier strings can be +stored in "extra" metadata. Crossref has [blogged][] about this distinction. + +[blogged]: https://www.crossref.org/blog/doi-like-strings-and-fake-dois/ + +#### DOI + +All DOIs stored in an entity column should be registered (aka, should be +resolvable from `doi.org`). Invalid identifiers may be cleaned up or removed by +bots. + +DOIs should *always* be stored and transferred in lower-case form. 
Note that +there are almost no other constraints on DOIs (and handles in general): they +may have multiple forward slashes, whitespace, be of arbitrary length, etc. +Crossref has a [number of examples][] of such "valid" but frustratingly +formatted strings. + +[number of examples]: https://www.crossref.org/blog/dois-unambiguously-and-persistently-identify-published-trustworthy-citable-online-scholarly-literature-right/ + +In the fatcat ontology, DOIs and release entities are one-to-one. + +It is the intention to automatically (via bot) create a fatcat release for +every Crossref-registered DOI from a whitelist of media types +("journal-article" etc, but not all), and it would be desirable to auto-create +entities for in-scope publications from all registrars. It is not the intention +to auto-create a release for every registered DOI. In particular, +"sub-component" DOIs (eg, for an individual figure or table from a publication) +aren't currently auto-created, but could be stored in "extra" metadata, or on a +case-by-case basis. + +## Human Names + +Representing names of human beings in databases is a fraught subject. For some +background reading, see: + +- [Falsehoods Programmers Believe About Names](https://www.kalzumeus.com/2010/06/17/falsehoods-programmers-believe-about-names/) (blog post) +- [Personal names around the world](https://www.w3.org/International/questions/qa-personal-names) (W3C informational) +- [Hubert Blaine Wolfeschlegelsteinhausenbergerdorff Sr.](https://en.wikipedia.org/wiki/Hubert_Blaine_Wolfeschlegelsteinhausenbergerdorff_Sr.) (Wikipedia article) + +Particularly difficult issues in the context of a bibliographic database include +the non-universal concept of "family" vs. 
"given" names and their relationship +to first and last names; the inclusion of honorary titles and other suffixes +and prefixes to a name; the distinction between "preferred", "legal", and +"bibliographic" names, or other situations where a person may not wish to be +known under the name they are commonly referred to under; language and character +set issues; and pseudonyms, anonymous publications, and fake personas (perhaps +representing a group, like Bourbaki). + +The general guidance for Fatcat is to: + +- not be a "source of truth" for representing a persona or human being; ORCiD + and Wikidata are better suited to this task +- represent author personas, not necessarily 1-to-1 with human beings +- prioritize the concerns of a reader or researcher over that of the author +- enable basic interoperability with external databases, file formats, schemas, + and style guides +- when possible, respect the wishes of individuals + +The data model for the `creator` entity has three name fields: + +- `surname` and `given_name`: needed for "aligning" with external databases, + and to export metadata to many standard formats +- `display_name`: the "preferred" representation for display of the entire name, + in the context of international attribution of authorship of a written work + +Names do not necessarily need to be expressed in a Latin character set, but also +do not necessarily need to be in the native language of the creator or the +language of their notable works. + +Ideally all three fields are populated for all creators. + +It seems likely that this schema and guidance will need review. "Extra" +metadata can be used to store aliases and alternative representations, which +may be useful for disambiguation and automated de-duplication. + +## Editgroups and Meta-Meta-Data + +Editors are expected to group their edits in semantically meaningful editgroups +of a reasonable size for review and acceptance. 
For example, merging two +`creators` and updating related `releases` could all go in a single editgroup. +Large refactors, conversions, and imports, which may touch thousands of +entities, should be grouped into reasonable size editgroups; extremely large +editgroups may cause technical issues, and make review unmanageable. 50 edits is +a decent batch size, and 100 is a good upper limit (and may be enforced by the +server). + diff --git a/guide/src/sw_contribute.md b/guide/src/sw_contribute.md new file mode 100644 index 00000000..17d72785 --- /dev/null +++ b/guide/src/sw_contribute.md @@ -0,0 +1,14 @@ +# Software Contributions + +For now, issues and patches can be filed at . + +To start, the back-end (fatcatd, in rust), web interface (fatcat-web, in +python), bots, and this guide are all versioned in the same git repository. + +See the `rust/README` and `rust/HACKING` documents for some common tasks and +gotchas when working with the rust backend. + +When considering making a non-trivial contribution, it can save review time and +duplicated work to post an issue with your intentions and plan. New code and +features will need to include unit tests before being merged, though we can +help with writing them. -- cgit v1.2.3