From 182413ad4946d715aabf67c396d688fbb5d1c0eb Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Thu, 20 Sep 2018 20:20:43 -0700
Subject: progress on guide

---
 guide/TODO                 |   9 ++
 guide/push_prod.sh         |   3 +
 guide/push_qa.sh           |   2 +-
 guide/src/SUMMARY.md       |   1 +
 guide/src/entity_fields.md | 302 +++++++++++++++++++++++++++++++++++++++++++++
 guide/src/goals.md         |  93 ++++++++++++++
 guide/src/overview.md      |  92 --------------
 guide/src/sources.md       |  28 +++++
 guide/src/style_guide.md   |  26 +++-
 9 files changed, 462 insertions(+), 94 deletions(-)
 create mode 100644 guide/TODO
 create mode 100755 guide/push_prod.sh
 create mode 100644 guide/src/goals.md

diff --git a/guide/TODO b/guide/TODO
new file mode 100644
index 00000000..e3f9f527
--- /dev/null
+++ b/guide/TODO
@@ -0,0 +1,9 @@
+- break up RFC into sub sections
+- better landing page
+- scope
+
+TODO
+- 
+
+DONE
+- policies
diff --git a/guide/push_prod.sh b/guide/push_prod.sh
new file mode 100755
index 00000000..c9ef5b1f
--- /dev/null
+++ b/guide/push_prod.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+rsync -arv book/ fatcat-prod-vm:/srv/fatcat/guide
diff --git a/guide/push_qa.sh b/guide/push_qa.sh
index ffdc41bb..8c6f68bd 100755
--- a/guide/push_qa.sh
+++ b/guide/push_qa.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-rsync -arv book/ fatcat-vm:/srv/fatcat/guide
+rsync -arv book/ fatcat-qa-vm:/srv/fatcat/guide
diff --git a/guide/src/SUMMARY.md b/guide/src/SUMMARY.md
index 736bb2cf..16f33ff1 100644
--- a/guide/src/SUMMARY.md
+++ b/guide/src/SUMMARY.md
@@ -1,6 +1,7 @@
 # Outline
 
 - [Fatcat Overview](./overview.md)
+    - [Goals and Related Projects](./goals.md)
     - [Data Model](./data_model.md)
     - [Workflow](./workflow.md)
     - [Sources](./sources.md)
diff --git a/guide/src/entity_fields.md b/guide/src/entity_fields.md
index 1a9e7bd4..0d0b2d6f 100644
--- a/guide/src/entity_fields.md
+++ b/guide/src/entity_fields.md
@@ -1 +1,303 @@
 # Entity Field Reference
+
+All entities have:
+
+- `extra`: free-form JSON metadata
+
+The "extra" field is an "escape hatch" to include extra fields not in the
+regular schema. It is intended to enable gradual evolution of the schema, as
+well as accommodating niche or field-specific content. That being said,
+reasonable limits should be adhered to.
+
+## Containers
+
+- `name`: (string, required). The title of the publication, as used in
+  international indexing services. Eg, "Journal of Important Results". Not
+  necessarily in the native language, but also not necessarily in English.
+  Alternative titles (and translations) can be stored in "extra" metadata
+  (TODO: what field?).
+- `publisher` (string): The name of the publishing organization. Eg, "Society
+  of Curious Students".
+- `issnl` (string): an external identifier, with registration controlled by the
+  [ISSN organization](http://www.issn.org/). Registration is relatively
+  inexpensive and easy to obtain (depending on world region), so almost all
+  serial publications have one. The ISSN-L ("linking ISSN") is one of either
+  the print ("ISSNp") or electronic ("ISSNe") identifiers for a serial
+  publication; not all publications have both types of ISSN, but many do, which
+  can cause confusion. The ISSN master list is not gratis/public, but the
+  ISSN-L mapping is.
+- `wikidata_qid` (string): external linking identifier to a Wikidata entity.
+- `abbrev` (string): a commonly used abbreviation for the publication, as used
+  in citations, following the [ISO 4](https://en.wikipedia.org/wiki/ISO_4) standard. Eg, "Journal of Polymer
+  Science Part A" -> "J. Polym. Sci. A". Alternative abbreviations can be
+  stored in "extra" metadata. (TODO: what field?)
+- `coden` (string): an external identifier, the [CODEN code][CODEN]. 6 characters,
+  all upper-case.
+
+[CODEN]: https://en.wikipedia.org/wiki/CODEN
+
+## Creators
+
+See ["Human Names"](./style_guide.md#human-names) sub-section of style
+guide.
+
+- `display_name` (string, required): Eg, "Grace Hopper".
+- `given_name` (string): Eg, "Grace".
+- `surname` (string): Eg, "Hopper".
+- `orcid` (string): external identifier, as registered with ORCID.
+- `wikidata_qid` (string): external linking identifier to a Wikidata entity.
+
+## Files
+
+- `size` (positive, non-zero integer): Eg: 1048576.
+- `sha1` (string): Eg: "f013d66c7f6817d08b7eb2a93e6d0440c1f3e7f8".
+- `md5`: Eg: "d41efcc592d1e40ac13905377399eb9b".
+- `sha256`: Eg: "a77e4c11a57f1d757fca5754a8f83b5d4ece49a2d28596889127c1a2f3f28832".
+- `urls`: An array of "typed" URLs. Order is not meaningful, and may not be
+  preserved.
+    - `url` (string, required):
+            Eg: "https://example.edu/~frau/prcding.pdf".
+    - `rel` (string, required):
+            Eg: "webarchive".
+- `mimetype` (string):
+    example: "application/pdf"
+- `releases` (array of identifiers): references to `release` entities that this
+  file represents a manifestation of. Note that a single file can contain
+  multiple release references (eg, a PDF containing a full issue with many
+  articles), and that a release will often have multiple files (differing only
+  by watermarks, or different digitizations of the same printed work, or
+  variant MIME/media types of the same published work). See also
+  "Work/Release/File Distinctions".
+
+## Releases
+
+- `title`: (required)
+        type: string
+- `work_id`:
+        type: string
+        example: "q3nouwy3nnbsvo3h5klxsx4a7y"
+- `container`:
+        $ref: "#/definitions/container_entity"
+        description: "Optional; GET-only"
+- `files`:
+        description: "Optional; GET-only"
+        type: array
+        items:
+          $ref: "#/definitions/file_entity"
+- `container_id`:
+        type: string
+        example: "q3nouwy3nnbsvo3h5klxsx4a7y"
+- `release_type`:
+        type: string
+        example: "book"
+- `release_status`:
+        type: string
+        example: "preprint"
+- `release_date`:
+        type: string
+        format: date
+- `doi`:
+        type: string
+        #format: custom
+        example: "10.1234/abcde.789". See the "External Identifiers" section of style guide.
+- `isbn13` (string): external identifier for books. ISBN-9 and other formats
+  should be converted to canonical ISBN-13. See the "External Identifiers"
+  section of style guide.
+- `core_id` (string): external identifier for the [CORE](https://core.ac.uk) open access
+  aggregator. These identifiers are integers, but stored in string format. See
+  the "External Identifiers" section of style guide.
+- `pmid` (string): external identifier for PubMed database. These are bare
+  integers, but stored in a string format. See the "External Identifiers"
+  section of style guide.
+- `pmcid` (string): external identifier for PubMed Central database. These are
+  integers prefixed with "PMC" (upper case), like "PMC4321". See the "External
+  Identifiers" section of style guide.
+- `wikidata_qid` (string): external identifier for Wikidata entities. These are
+  integers prefixed with "Q", like "Q4321". Each `release` entity can be
+  associated with at most one Wikidata entity (this field is not an array), and
+  Wikidata entities should be associated with at most a single `release`. In
+  the future it may be possible to associate Wikidata entities with `work`
+  entities instead. See the "External Identifiers" section of style guide.
+- `volume` (string): optionally, stores the specific volume of a serial
+  publication this release was published in.
+        type: string
+- `issue` (string): optionally, stores the specific issue of a serial
+  publication this release was published in.
+- `pages` (string): the pages (within a volume/issue of a publication) that
+  this release can be looked up under. This is a free-form string, and could
+  represent the first page, a range of pages, or even prefix pages (like
+  "xii-xxx").
+- `publisher` (string): name of the publishing entity. This does not need to be
+  populated if the associated `container` entity has the publisher field set,
+  though it is acceptable to duplicate, as the publishing entity of a container
+  may differ over time. Should be set for singleton releases, like books.
+- `language` (string): the primary language used in this particular release of
+  the work. Only a single language can be specified; additional languages can
+  be stored in "extra" metadata (TODO: which field?). This field should be a
+  valid RFC1766/ISO639-1 language code ("with extensions"), aka a controlled
+  vocabulary, not a free-form name of the language.
+- `contribs`: an array of authorship and other `creator` contributions to this
+  release. Contribution fields include:
+    - `index` (integer, optional): the (zero-indexed) order of this
+      author. Authorship order has significance in many fields. Non-author
+      contributions (illustration, translation, editorship) may or may not be
+      ordered, depending on context, but index numbers should be unique per
+      release (aka, there should not be "first author" and "first translator")
+    - `creator_id` (identifier): if known, a reference to a specific `creator`
+    - `raw_name` (string): the name of the contributor, as attributed in the
+      text of this work. If the `creator_id` is linked, this may be different
+      from the `display_name`; if a creator is not linked, this field is
+      particularly important. Syntax and name order is not specified, but most
+      often will be "display order", not index/alphabetical (in Western
+      tradition, surname followed by given name).
+    - `role` (string, of a set): the type of contribution, from a controlled
+      vocabulary. TODO: vocabulary needs review.
+    - `extra` (string): additional context can go here. For example, author
+      affiliation, "this is the corresponding author", etc.
+- `refs`: an array of references (aka, citations) to other releases. References
+  can only be linked to a specific target release (not a work), though it may
+  be ambiguous which release of a work is being referenced if the citation is
+  not specific enough. Reference fields include:
+    - index:
+        type: integer
+        format: int64
+    - target_release_id:
+        type: string
+        #format: ident
+    - extra:
+        type: object
+        additionalProperties: {}
+    - key:
+        type: string
+    - year:
+        type: integer
+        format: int64
+    - container_title:
+        type: string
+    - title:
+        type: string
+    - locator:
+        type: string
+        example: "p123"
+
+Controlled vocabulary for `release_type` is derived from the Crossref `type`
+vocabulary:
+
+- `journal-article`
+- `proceedings-article`
+- `monograph`
+- `dissertation`
+- `book` (and `edited-book`, `reference-book`)
+- `book-chapter` (and `book-part`, `book-section`, though much rarer) is
+  allowed as these are frequently referenced and read independent of the entire
+  book. The data model does not currently support linking a subset of a release
+  to an entity representing the entire release. The release/work/file
+  distinctions should not be used to group chapters into complete work; a book
+  chapter can be its own work. A paper which is republished as a chapter (eg,
+  in a collection, or "edited" book) can have both releases under one work. The
+  criteria of whether to "split" a book and have release entities for each
+  chapter is whether the chapter has been cited/referenced as such.
+- `dissertation`
+- `dataset` (though representation with `file` entities is TBD).
+- `monograph`
+- `report`
+- `standard`
+- `posted-content` is allowed, but may be re-categorized. For crossref, this
+  seems to imply a journal article or report which is not published (pre-print)
+- `other` matches Crossref `other` works, which may (and generally should) have
+  a more specific type set.
+- `web-post` (custom extension) for blog posts, essays, and other individual
+  works on websites
+- `website` (custom extension) for entire web sites and wikis.
+- `presentation` (custom extension) for, eg, slides and recorded conference
+  presentations themselves, as distinct from `proceedings-article`
+- `editorial` (custom extension) for columns, "in this issue", and other
+  content published along peer-reviewed content in journals. Can bleed in to
+  "other" or "stub"
+- `book-review` (custom extension)
+- `letter` for "letters to the editor", "authors respond", and
+  sub-article-length published content
+- `example` (custom extension) for dummy or example releases that have valid
+  (registered) identifiers. Other metadata does not need to match "canonical"
+  examples.
+- `stub` (custom extension) for releases which have notable external
+  identifiers, and thus are included "for completeness", but don't seem to
+  represent a "full work". An example might be a paper that gets an extra DOI
+  by accident; the primary DOI should be a full release, and the accidental DOI
+  can be a `stub` release under the same work. `stub` releases shouldn't be
+  considered full releases when counting or aggregating (though if technically
+  difficult this may not always be implemented). Other things that can be
+  categorized as stubs (which seem to often end up miscategorized as full
+  articles in bibliographic databases):
+    - an abstract, which is only an abstract of a larger work
+    - commercial advertisements
+    - "trap" or "honey pot" works, which are fakes included in databases to
+      detect re-publishing without attribution
+    - "This page is intentionally blank"
+    - "About the author", "About the editors", "About the cover"
+    - "Acknowledgements"
+    - "Notices"
+
+Other types from Crossref (such as `component`, `reference-entry`) are valid,
+but are not actively solicited for inclusion, as they are not the current focus
+of the database.
+
+In the future, some types (like `journal`, `proceedings`, and `book-series`)
+will probably be represented as `container` entities. How to represent other
+container-like types (like `report-series` or `book-series`) is TBD.
+
+Controlled vocabulary for `release_status`:
+- `published` for any version of the work that was "formally published", or any
+  variant that can be considered a "proof", "camera ready", "archival",
+  "version of record" or "definitive" that have no meaningful differences from
+  the "published" version. Note that "meaningful" here will need to be
+  explored.
+- `corrected` for a version of a work that, after formal publication, has been
+  revised and updated. Could be the "version of record".
+- `pre-print`, for versions of a work which have not been submitted for peer
+  review or formal publication
+- `post-print`, often a post-peer-review version of a work that does not have
+  publisher-supplied copy-editing, typesetting, etc.
+- `draft` in the context of book publication or online content (shouldn't be
+  applied to journal articles), is an unpublished, but somehow notable version
+  of a work.
+- If blank, indicates status isn't known, and wasn't inferred at creation time.
+  Can often be interpreted as `published`.
+
+Controlled vocabulary for `role` field on `contribs`:
+- `author`
+- `translator`
+- `illustrator`
+- `editor`
+- If blank, indicates that type of contribution is not known; this can often be
+  interpreted as authorship.
+
+Current "extra" fields, flags, and content:
+- `crossref` (object), for extra crossref-specific metadata
+- `is_retracted` (boolean flag) if this work has been retracted
+- `translation_of` (release identifier) if this release is a translation of
+  another (usually under the same work)
+- `arxiv_id` (string) external identifier to a (version-specific) [arxiv.org]
+  work
+
+[arxiv.org]: https://arxiv.org
+
+abstracts:
+        type: array
+        items:
+          type: object
+          properties:
+            sha1:
+              type: string
+              example: "3f242a192acc258bdfdb151943419437f440c313"
+            content:
+              type: string
+              example: "<jats:p>Some abstract thing goes here</jats:p>"
+            mimetype:
+              type: string
+              example: "application/xml+jats"
+            lang:
+              type: string
+              example: "en"
+## Works
+
diff --git a/guide/src/goals.md b/guide/src/goals.md
new file mode 100644
index 00000000..80d0f145
--- /dev/null
+++ b/guide/src/goals.md
@@ -0,0 +1,93 @@
+# Goals and Related Projects
+
+## Goals and Ecosystem Niche
+
+For the Internet Archive use case, fatcat has two primary use cases:
+
+- Track the "completeness" of our holdings against all known published works.
+  In particular, allow us to monitor and prioritize further collection work.
+- Be a public-facing catalog and access mechanism for our open access holdings.
+
+In the larger ecosystem, fatcat could also provide:
+
+- A work-level (as opposed to title-level) archival dashboard: what fraction of
+  all published works are preserved in archives? KBART, CLOCKSS, Portico, and
+  other preservations don't provide granular metadata
+- A collaborative, independent, non-commercial, fully-open, field-agnostic,
+  "completeness"-oriented catalog of scholarly metadata
+- Unified (centralized) foundation for discovery and access across repositories
+  and archives: discovery projects can focus on user experience instead of
+  building their own catalog from scratch
+- Research corpus for meta-science, with an emphasis on availability and
+  reproducibility (metadata corpus itself is open access, and file-level hashes
+  control for content drift)
+- Foundational infrastructure for distributed digital preservation
+- On-ramp for non-traditional digital works ("grey literature") into the
+  scholarly web
+
+## Scope
+
+The goal is to capture the "scholarly web": the graph of written works that
+cite other works. Any work that is both cited more than once and cites more
+than one other work in the catalog is very likely to be in scope. "Leaf nodes"
+and small islands of intra-cited works may or may not be in scope.
+
+fatcat would not include any fulltext content itself, even for cleanly licensed
+(open access) works, but would have "strong" (verified) links to fulltext
+content, and would include file-level metadata (like hashes and fingerprints)
+to help discovery and identify content from any source. File-level URLs with
+context ("repository", "author-homepage", "web-archive") should make fatcat
+more useful for both humans and machines to quickly access fulltext content of
+a given mimetype than existing redirect or landing page systems. So another
+factor in deciding scope is whether a work has "digital fixity" and can be
+contained in a single immutable file.
+
+## References and Previous Work
+
+The closest overall analog of fatcat is [MusicBrainz][mb], a collaboratively
+edited music database. [Open Library][ol] is a very similar existing service,
+which exclusively contains book metadata.
+
+[Wikidata][wd] seems to be the most successful and actively edited/developed
+open bibliographic database at this time (early 2018), including the
+[wikicite][wikicite] conference and related Wikimedia/Wikipedia projects.
+Wikidata is a general purpose semantic database of entities, facts, and
+relationships; bibliographic metadata has become a large fraction of all
+content in recent years. The focus there seems to be linking knowledge
+(statements) to specific sources unambiguously. Potential advantages fatcat
+would have would be a focus on a specific scope (not a general-purpose database
+of entities) and a goal of completeness (capturing as many works and
+relationships as rapidly as possible). However, it might be better to just
+pitch in to the wikidata efforts.
+
+The technical design of fatcat is loosely inspired by the git
+branch/tag/commit/tree architecture, and specifically inspired by Oliver
+Charles' "New Edit System" [blog posts][nes-blog] from 2012.
+
+There are a whole bunch of proprietary, for-profit bibliographic databases,
+including Web of Science, Google Scholar, Microsoft Academic Graph, aminer,
+Scopus, and Dimensions. There are excellent field-limited databases like dblp,
+MEDLINE, and Semantic Scholar. There are some large general-purpose databases
+that are not directly user-editable, including the OpenCitation corpus, CORE,
+BASE, and CrossRef. I don't know of any large (more than 60 million works),
+open (bulk-downloadable with permissive or no license), field agnostic,
+user-editable corpus of scholarly publication bibliographic metadata.
+
+[nes-blog]: https://ocharles.org.uk/blog/posts/2012-07-10-nes-does-it-better-1.html
+[mb]: https://musicbrainz.org
+[ol]: https://openlibrary.org
+[wd]: https://wikidata.org
+[wikicite]: https://meta.wikimedia.org/wiki/WikiCite_2017
+
+## Further Reading
+
+"From ISIS to CouchDB: Databases and Data Models for Bibliographic Records" by Luciano G. Ramalho. code4lib, 2013. <https://journal.code4lib.org/articles/4893>
+
+"Representing bibliographic data in JSON". github README file, 2017. <https://github.com/rdmpage/bibliographic-metadata-json>
+
+"Citation Style Language", <https://citationstyles.org/>
+
+"Functional Requirements for Bibliographic Records", Wikipedia article, <https://en.wikipedia.org/wiki/Functional_Requirements_for_Bibliographic_Records>
+
+OpenCitations and I4OC <http://opencitations.net/>, <https://i4oc.org/>
+
diff --git a/guide/src/overview.md b/guide/src/overview.md
index 8e6279ed..ef631b87 100644
--- a/guide/src/overview.md
+++ b/guide/src/overview.md
@@ -8,95 +8,3 @@ file-level metadata.
 
 fatcat is currently used internally at the Internet Archive, but interested
 folks are welcome to contribute to design and development.
-
-## Goals and Ecosystem Niche
-
-For the Internet Archive use case, fatcat has two primary use cases:
-
-- Track the "completeness" of our holdings against all known published works.
-  In particular, allow us to monitor and prioritize further collection work.
-- Be a public-facing catalog and access mechanism for our open access holdings.
-
-In the larger ecosystem, fatcat could also provide:
-
-- A work-level (as opposed to title-level) archival dashboard: what fraction of
-  all published works are preserved in archives? KBART, CLOCKSS, Portico, and
-  other preservations don't provide granular metadata
-- A collaborative, independent, non-commercial, fully-open, field-agnostic,
-  "completeness"-oriented catalog of scholarly metadata
-- Unified (centralized) foundation for discovery and access across repositories
-  and archives: discovery projects can focus on user experience instead of
-  building their own catalog from scratch
-- Research corpus for meta-science, with an emphasis on availability and
-  reproducibility (metadata corpus itself is open access, and file-level hashes
-  control for content drift)
-- Foundational infrastructure for distributed digital preservation
-- On-ramp for non-traditional digital works ("grey literature") into the
-  scholarly web
-
-## Scope
-
-The goal is to capture the "scholarly web": the graph of written works that
-cite other works. Any work that is both cited more than once and cites more
-than one other work in the catalog is very likely to be in scope. "Leaf nodes"
-and small islands of intra-cited works may or may not be in scope.
-
-fatcat would not include any fulltext content itself, even for cleanly licensed
-(open access) works, but would have "strong" (verified) links to fulltext
-content, and would include file-level metadata (like hashes and fingerprints)
-to help discovery and identify content from any source. File-level URLs with
-context ("repository", "author-homepage", "web-archive") should make fatcat
-more useful for both humans and machines to quickly access fulltext content of
-a given mimetype than existing redirect or landing page systems. So another
-factor in deciding scope is whether a work has "digital fixity" and can be
-contained in a single immutable file.
-
-## References and Previous Work
-
-The closest overall analog of fatcat is [MusicBrainz][mb], a collaboratively
-edited music database. [Open Library][ol] is a very similar existing service,
-which exclusively contains book metadata.
-
-[Wikidata][wd] seems to be the most successful and actively edited/developed
-open bibliographic database at this time (early 2018), including the
-[wikicite][wikicite] conference and related Wikimedia/Wikipedia projects.
-Wikidata is a general purpose semantic database of entities, facts, and
-relationships; bibliographic metadata has become a large fraction of all
-content in recent years. The focus there seems to be linking knowledge
-(statements) to specific sources unambiguously. Potential advantages fatcat
-would have would be a focus on a specific scope (not a general-purpose database
-of entities) and a goal of completeness (capturing as many works and
-relationships as rapidly as possible). However, it might be better to just
-pitch in to the wikidata efforts.
-
-The technical design of fatcat is loosely inspired by the git
-branch/tag/commit/tree architecture, and specifically inspired by Oliver
-Charles' "New Edit System" [blog posts][nes-blog] from 2012.
-
-There are a whole bunch of proprietary, for-profit bibliographic databases,
-including Web of Science, Google Scholar, Microsoft Academic Graph, aminer,
-Scopus, and Dimensions. There are excellent field-limited databases like dblp,
-MEDLINE, and Semantic Scholar. There are some large general-purpose databases
-that are not directly user-editable, including the OpenCitation corpus, CORE,
-BASE, and CrossRef. I don't know of any large (more than 60 million works),
-open (bulk-downloadable with permissive or no license), field agnostic,
-user-editable corpus of scholarly publication bibliographic metadata.
-
-[nes-blog]: https://ocharles.org.uk/blog/posts/2012-07-10-nes-does-it-better-1.html
-[mb]: https://musicbrainz.org
-[ol]: https://openlibrary.org
-[wd]: https://wikidata.org
-[wikicite]: https://meta.wikimedia.org/wiki/WikiCite_2017
-
-## Further Reading
-
-"From ISIS to CouchDB: Databases and Data Models for Bibliographic Records" by Luciano G. Ramalho. code4lib, 2013. <https://journal.code4lib.org/articles/4893>
-
-"Representing bibliographic data in JSON". github README file, 2017. <https://github.com/rdmpage/bibliographic-metadata-json>
-
-"Citation Style Language", <https://citationstyles.org/>
-
-"Functional Requirements for Bibliographic Records", Wikipedia article, <https://en.wikipedia.org/wiki/Functional_Requirements_for_Bibliographic_Records>
-
-OpenCitations and I40C <http://opencitations.net/>, <https://i4oc.org/>
-
diff --git a/guide/src/sources.md b/guide/src/sources.md
index e70306d4..b8853d8a 100644
--- a/guide/src/sources.md
+++ b/guide/src/sources.md
@@ -1 +1,29 @@
 # Sources
+
+The core metadata bootstrap sources, by entity type, are:
+
+- `releases`: Crossref metadata, with DOIs as the primary identifier, and
+  PubMed (central), Wikidata, and [CORE] identifiers cross-referenced
+- `containers`: munged metadata from the DOAJ, ROAD, and Norwegian journal
+  list, with ISSN-Ls as the primary identifier. ISSN provides an "ISSN to
+  ISSN-L" mapping to normalize electronic and print ISSN numbers.
+- `creators`: ORCID metadata and identifier.
+
+Initial `file` metadata and matches (file-to-release) come from earlier
+Internet Archive matching efforts, and in particular efforts to extract
+bibliographic metadata from PDFs (using GROBID) and fuzzy match (with
+conservative settings) to Crossref metadata.
+
+[CORE]: https://core.ac.uk
+
+The intent is to continuously ingest and merge metadata from a small number of
+large (~2-3 million or more records) general-purpose aggregators and catalogs
+in a centralized fashion, using bots, and then support volunteers and
+organizations in writing bots to merge high-quality metadata from field or
+institution-specific catalogs.
+
+Provenance information (where the metadata comes from, or who "makes specific
+claims") is stored in edit metadata in the data model. Value-level attribution
+can be achieved by looking at the full edit history for an entity as a series of
+patches.
+
diff --git a/guide/src/style_guide.md b/guide/src/style_guide.md
index 944e68ce..1457a544 100644
--- a/guide/src/style_guide.md
+++ b/guide/src/style_guide.md
@@ -13,7 +13,7 @@ the release listed in the work itself
 This is not to be confused with *translations* of entire works, which should be
 treated as an entirely separate `release`.
 
-## Work/Release Distinction
+## Work/Release/File Distinctions
 
 ## External Identifiers
 
@@ -51,6 +51,30 @@ to auto-create a release for every registered DOI. In particular,
 aren't currently auto-created, but could be stored in "extra" metadata, or on a
 case-by-case basis.
 
+#### ISSN
+
+TODO
+
+#### ORCID
+
+TODO
+
+#### Wikidata QID
+
+TODO
+
+#### CORE Identifier
+
+TODO
+
+#### ISBN-13
+
+TODO
+
+#### PubMed (PMID and PMCID)
+
+TODO
+
 ## Human Names
 
 Representing names of human beings in databases is a fraught subject. For some
-- 
cgit v1.2.3