Merge branch 'bnewbold-content-scope'

author: Bryan Newbold <bnewbold@robocracy.org> 2021-11-22 16:12:01 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-11-22 16:12:01 -0800
commit: 5c7f50b2f497692493bfa54ad4741fdc573352ae (patch)
tree: c20cce1884076fffe210ba28e1a569f93ed22827
parent: f3bd82c0308948a63645538bdd9511a503625499 (diff)
parent: dd00cec4164c1a1c31c8d9cffb92deb2e30b2211 (diff)
download: fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.tar.gz
fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.zip
33 files changed, 346 insertions, 28 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index af5d96c4..4f080a17 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,11 @@ See also:
 - [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
 
 
-## [UNRELEASED]
+## [0.5.0] - UNRELEASED
+
+Small change to the API schema (and SQL schema), adding the `content_scope`
+field to file, fileset, and webcapture entities. Because there is a SQL schema
+change, bumping to version v0.5.0.
 
 An outward-facing change is that the fatcat API server now attempts to
 "stabilize" array order within JSON responses by sorting elements by
@@ -28,6 +32,9 @@ was created.
 In particular, this may cause broad discrepencies compared to historical bulk
 metadata exports. New bulk exports will be generated with the new ordering.
 
+A number of content cleanups and changes are also taking place to the primary
+catalog (fatcat.wiki), see the separate content CHANGELOG for details.
+
 ### Fixed
 
 - API array order stablization, using `ORDER BY` in `fatcatd`. See note above.
@@ -35,12 +42,22 @@ metadata exports. New bulk exports will be generated with the new ordering.
 ### Changed
 
 - broad python code style updates: formatting, lint rules, and type annotations
+- a number of internal refactors of metadata importers
+- stopped creating a small number of Datacite-specific license slugs
+- stopped trying to "fix" double slashes in DOIs, in most cases
+- reduced amount of metadata stored in release `extra` field in Datacite
+  importer
 
 ### Added
 
+- `content_scope` field on file, fileset, and webcapture entities
 - initial fileset importers
 - JSON pseudo-API for reference string match/get interface
 
+### Removed
+
+- deleted deprecated `cdl_dash_dat` and `wayback_static` one-time importers
+
 ## [0.4.0] - 2021-10-14
 
 Includes small API and SQL schema changes; see
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index a8dbc6d0..34e2b0b3 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -37,6 +37,7 @@
             "release_ids":      { "type": "keyword", "normalizer": "default", "doc_values": false },
             "release_count":    { "type": "integer" },
             "mimetype":         { "type": "keyword", "normalizer": "default" },
+            "content_scope":    { "type": "keyword", "normalizer": "default" },
             "size_bytes":       { "type": "integer" },
             "sha1":             { "type": "keyword", "normalizer": "default", "doc_values": false },
             "sha256":           { "type": "keyword", "normalizer": "default", "doc_values": false },
diff --git a/fatcat-openapi2.yml b/fatcat-openapi2.yml
index 67915dda..7fafdb89 100644
--- a/fatcat-openapi2.yml
+++ b/fatcat-openapi2.yml
@@ -2,7 +2,7 @@
 swagger: "2.0"
 info:
   title: fatcat
-  version: 0.4.0
+  version: 0.5.0
   description: |
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic
     entities and file metadata.
@@ -460,6 +460,9 @@ definitions:
       mimetype:
         type: string
         example: "application/pdf"
+      content_scope:
+        type: string
+        example: "issue"
       release_ids:
         type: array
         items:
@@ -500,6 +503,9 @@ definitions:
     type: object
     properties:
       <<: *ENTITYPROPS
+      content_scope:
+        type: string
+        example: "issue"
       manifest:
         # limit of 200 files, at least to start
         type: array
@@ -601,6 +607,9 @@ definitions:
             Same format as CDX line timestamp (UTC, etc). Corresponds to the
             overall capture timestamp. Should generally be the timestamp of
             capture of the primary resource URL.
+      content_scope:
+        type: string
+        example: "landing-page"
       release_ids:
         type: array
         items:
diff --git a/guide/src/entity_file.md b/guide/src/entity_file.md
index 7429c982..84d9eac4 100644
--- a/guide/src/entity_file.md
+++ b/guide/src/entity_file.md
@@ -13,9 +13,13 @@
 - `urls`: An array of "typed" URLs. Order is not meaningful, and may not be
   preserved.
     - `url` (string, required): Eg: "https://example.edu/~frau/prcding.pdf".
-    - `rel` (string, required): Eg: "webarchive".
+    - `rel` (string, required): Eg: "webarchive", see vocabulary below.
 - `mimetype` (string): Format of the file. If XML, specific schema can be
   included after a `+`. Example: "application/pdf"
+- `content_scope` (string): for situations where the file does not simply
+  contain the full representation of a work (eg, fulltext of an article, for an
+  `article-journal` release), describes what that scope of coverage is. Eg,
+  entire `issue`, `corrupt` file. See vocabulary below.
 - `release_ids` (array of string identifiers): references to `release` entities
   that this file represents a manifestation of. Note that a single file can
   contain multiple release references (eg, a PDF containing a full issue with
@@ -35,3 +39,37 @@
   Scholar
 - `dweb`: content hosted on distributed/decentralized web protocols, such as
   `dat://` or `ipfs://` URLs
+
+#### `content_scope` Vocabulary
+
+This same vocabulary is shared between file, fileset, and webcapture entities;
+not all the fields make sense for each entity type.
+
+- if not set, assume that the artifact entity is valid and represents a
+  complete copy of the release
+- `issue`: artifact contains an entire issue of a serial publication (eg, issue
+  of a journal), representing several releases in full
+- `abstract`: contains only an abstract (short description) of the release, not
+  the release itself (unless the `release_type` itself is `abstract`, in which
+  case it is the entire release)
+- `index`: index of a journal, or series of abstracts from a conference
+- `slides`: slide deck (usually in "landscape" orientation)
+- `front-matter`: non-article content from a journal, such as editorial policies
+- `supplement`: usually a file entity which is a supplement or appendix, not
+  the entire work
+- `component`: a sub-component of a release, which may or may not be associated
+  with a `component` release entity. For example, a single figure or table as
+  part of an article
+- `poster`: digital copy of a poster, eg as displayed at conference poster sessions
+- `sample`: a partial sample of the entire work. eg, just the first page of an
+  article. distinct from `truncated`
+- `truncated`: the file has been truncated at a binary level, and may also be
+  corrupt or invalid. distinct from `sample`
+- `corrupt`: broken, mangled, or corrupt file (at the binary level)
+- `stub`: any other out-of-scope artifact situations, where the artifact
+  represents something which would not link to any possible in-scope release in
+  the catalog (except a `stub` release)
+- `landing-page`: for webcapture, the landing page of a work, as opposed to the
+  work itself
+- `spam`: content is spam. articles, webpages, or issues which include
+  incidental advertisements within them are not counted as `spam`
diff --git a/guide/src/entity_fileset.md b/guide/src/entity_fileset.md
index e1ac3e67..6083a09d 100644
--- a/guide/src/entity_fileset.md
+++ b/guide/src/entity_fileset.md
@@ -21,6 +21,10 @@
     - `rel` (string, required):
             Eg: "webarchive".
 - `release_ids` (array of string identifiers): references to `release` entities
+- `content_scope` (string): for situations where the fileset does not simply
+  contain the full representation of a work (eg, all files in dataset, for a
+  `dataset` release), describes what that scope of coverage is. Uses same
+  vocabulary as File entity.
 - `extra` (object with string keys): additional metadata about this group of
   files, including upstream platform-specific metadata and identifiers
 
diff --git a/guide/src/entity_webcapture.md b/guide/src/entity_webcapture.md
index 8c5615fb..1b3cac55 100644
--- a/guide/src/entity_webcapture.md
+++ b/guide/src/entity_webcapture.md
@@ -29,4 +29,10 @@ Warning: This schema is not yet stable.
 - `timestamp` (string, datetime): same format as CDX line timestamp (UTC, etc).
   Corresponds to the overall capture timestamp. Can be the earliest of CDX
   timestamps if that makes sense
+- `content_scope` (string): for situations where the webcapture does not simply
+  contain the full representation of a work (eg, HTML fulltext, for an
+  `article-journal` release), describes what that scope of coverage is. Eg,
+  `landing-page` if it doesn't contain the full content. Landing pages are
+  out-of-scope for fatcat, but if they were accidentally imported, mark them
+  as such so they aren't re-imported. Uses same vocabulary as File entity.
 - `release_ids` (array of string identifiers): references to `release` entities
diff --git a/proposals/2021-11-17_content_scope.md b/proposals/2021-11-17_content_scope.md
new file mode 100644
index 00000000..8d04808e
--- /dev/null
+++ b/proposals/2021-11-17_content_scope.md
@@ -0,0 +1,84 @@
+
+status: planned
+
+Content Scope Fields
+======================
+
+Usually, "artifact" entities (file, fileset, webcapture) should not contain
+bibliographic metadata about their contents. For example, a file entity
+describing a PDF of a journal article should not indicate the publication
+stage, retraction status, publication type, journal ISSN, or other metadata
+about that article; the `release` entity should contain that information.
+Additionally, it is usually assumed that a single "artifact" entity is a
+complete representation of any associated release entities: the complete
+dataset, or complete article.
+
+This document describes a new metadata field to handle some special cases that
+go against this principle: the `content_scope` of a file, fileset, or
+webcapture. It is intended to be used when there is an exception to the
+assumption that a single "artifact" is a complete representation of a release.
+It is particularly useful when there is a problem with the artifact, resulting
+in it being disassociated from all releases.
+
+
+## Values
+
+This section will get copied to the guide.
+
+- if not set, assume that the artifact entity is valid and represents a
+  complete copy of the release
+- `issue`: artifact contains an entire issue of a serial publication (eg, issue
+  of a journal), representing several releases in full
+- `abstract`: contains only an abstract (short description) of the release, not
+  the release itself (unless the `release_type` itself is `abstract`, in which
+  case it is the entire release)
+- `index`: index of a journal, or series of abstracts from a conference (TODO:
+  separate value for conference abstract lists?)
+- `slides`: slide deck (usually in "landscape" orientation)
+- `front-matter`: non-article content from a journal, such as editorial policies
+- `supplement`: usually a file entity which is a supplement or appendix, not
+  the entire work
+- `component`: a sub-component of a release, which may or may not be associated
+  with a `component` release entity. For example, a single figure or table as
+  part of an article
+- `poster`: digital copy of a poster, eg as displayed at conference poster sessions
+- `sample`: a partial sample of the entire work. eg, just the first page of an
+  article. distinct from `truncated`
+- `truncated`: the file has been truncated at a binary level, and may also be
+  corrupt or invalid. distinct from `sample`
+- `corrupt`: broken, mangled, or corrupt file (at the binary level)
+- `stub`: any other out-of-scope artifact situations, where the artifact
+  represents something which would not link to any possible in-scope release in
+  the catalog (except a `stub` release)
+- `landing-page`: for webcapture, the landing page of a work, as opposed to the
+  work itself
+- `spam`: content is spam. articles, webpages, or issues which include
+  incidental advertisements within them are not counted as `spam`
+
+
+## Implementation
+
+The string field `content_scope` will be added to file, fileset, and webcapture
+entities.
+
+By default, this field does not need to be set. If it is empty, it can be
+assumed that the artifact represents an appropriate copy of the full release.
+If it is set, and the artifact is associated with one or more releases,
+downstream users/code may want to verify that the `content_scope` and
+`release_type` values are consistent. For example, `slides` is not consistent
+with `article-journal`, so such a file should be marked for review, and not
+considered a valid access option or preservation copy for the purposes of
+coverage analysis.
+
+
+## Removing Release Linkage
+
+In cases where the "artifact" entity is not an acceptable representation of any
+release (eg, truncation, corruption, spam), the entity should have the
+`release_ids` field cleared.
+
+Optionally, the new `extra` field `related_release_ids` can be used to indicate
+that an artifact entity has something to do with specific releases, but is not
+a full representation of them. This can be useful for corrupt or partial
+content to link to releases it is a partial representation of.
+
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index a5c26410..2c7d08b0 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -262,7 +262,7 @@
         },
         "fatcat-openapi-client": {
             "path": "./../python_openapi_client",
-            "version": "==0.4.0"
+            "version": "==0.5.0"
         },
         "filelock": {
             "hashes": [
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index d4962205..c16053ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -650,6 +650,7 @@ def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]:
         release_ids=entity.release_ids,
         release_count=len(entity.release_ids),
         mimetype=entity.mimetype,
+        content_scope=entity.content_scope,
         size_bytes=entity.size,
         sha1=entity.sha1,
         sha256=entity.sha256,
diff --git a/python/tests/api_files.py b/python/tests/api_files.py
index 8f9caf3e..cd08eef7 100644
--- a/python/tests/api_files.py
+++ b/python/tests/api_files.py
@@ -13,6 +13,7 @@ def test_file(api):
         sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
         sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3",
         mimetype="application/pdf",
+        content_scope="article",
         urls=[
             FileUrl(
                 url="https://web.archive.org/web/12345542/something.com/blah.pdf",
@@ -39,6 +40,7 @@ def test_file(api):
     assert f1.sha1 == f2.sha1
     assert f1.sha256 == f2.sha256
     assert f1.mimetype == f2.mimetype
+    assert f1.content_scope == f2.content_scope
     assert f1.extra == f2.extra
     assert f1.urls == f2.urls
     assert f1.release_ids == f2.release_ids
diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py
index d7654eb9..8ab658f5 100644
--- a/python/tests/api_filesets.py
+++ b/python/tests/api_filesets.py
@@ -34,6 +34,7 @@ def test_fileset(api):
             FilesetUrl(url="https://humble-host.com/~user123/dataset/", rel="web"),
         ],
         release_ids=[r1edit.ident],
+        content_scope="dataset",
         extra=dict(t=4, u=9),
         edit_extra=dict(test_key="filesets rule"),
     )
@@ -52,6 +53,7 @@ def test_fileset(api):
     assert fs1.urls == fs2.urls
     assert fs1.manifest == fs2.manifest
     assert fs1.release_ids == fs2.release_ids
+    assert fs1.content_scope == fs2.content_scope
     assert fs1.extra == fs2.extra
 
     # expansion
diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py
index 76bc68c0..6a477ff2 100644
--- a/python/tests/api_webcaptures.py
+++ b/python/tests/api_webcaptures.py
@@ -44,6 +44,7 @@ def test_webcapture(api):
             FileUrl(rel="wayback", url="https://web.archive.org/web/"),
         ],
         release_ids=[r1edit.ident],
+        content_scope="landing-page",
         extra=dict(c=1, b=2),
         edit_extra=dict(test_key="webcaptures rule"),
     )
@@ -69,6 +70,7 @@ def test_webcapture(api):
     assert wc1.release_ids == wc2.release_ids
     assert wc1.timestamp == wc2.timestamp
     assert wc1.original_url == wc2.original_url
+    assert wc1.content_scope == wc2.content_scope
     assert wc1.extra == wc2.extra
 
     # check release expansion
diff --git a/python_openapi_client/README.md b/python_openapi_client/README.md
index 8cc34147..316b9dfd 100644
--- a/python_openapi_client/README.md
+++ b/python_openapi_client/README.md
@@ -3,8 +3,8 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities
 
 This Python package is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
 
-- API version: 0.4.0
-- Package version: 0.4.0
+- API version: 0.5.0
+- Package version: 0.5.0
 - Build package: org.openapitools.codegen.languages.PythonClientCodegen
 For more information, please visit [https://fatcat.wiki](https://fatcat.wiki)
 
diff --git a/python_openapi_client/codegen_python_client.sh b/python_openapi_client/codegen_python_client.sh
index bbf2ad83..17782c68 100755
--- a/python_openapi_client/codegen_python_client.sh
+++ b/python_openapi_client/codegen_python_client.sh
@@ -20,7 +20,7 @@ docker run \
     --input-spec /tmp/swagger/api.yml \
     --output /tmp/swagger/ \
     --package-name=fatcat_openapi_client \
-    -p packageVersion="0.4.0"
+    -p packageVersion="0.5.0"
 
 sudo chown -R `whoami`:`whoami` $OUTPUT
 mkdir -p fatcat_openapi_client
diff --git a/python_openapi_client/fatcat_openapi_client/__init__.py b/python_openapi_client/fatcat_openapi_client/__init__.py
index 5f1a7fba..8749264e 100644
--- a/python_openapi_client/fatcat_openapi_client/__init__.py
+++ b/python_openapi_client/fatcat_openapi_client/__init__.py
@@ -7,7 +7,7 @@
 
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata.   # noqa: E501
 
-    The version of the OpenAPI document: 0.4.0
+    The version of the OpenAPI document: 0.5.0
     Contact: webservices@archive.org
     Generated by: https://openapi-generator.tech
 """
@@ -15,7 +15,7 @@
 
 from __future__ import absolute_import
 
-__version__ = "0.4.0"
+__version__ = "0.5.0"
 
 # import apis into sdk package
 from fatcat_openapi_client.api.default_api import DefaultApi
diff --git a/python_openapi_client/fatcat_openapi_client/__version__.py b/python_openapi_client/fatcat_openapi_client/__version__.py
index 618922fe..b0634514 100644
--- a/python_openapi_client/fatcat_openapi_client/__version__.py
+++ b/python_openapi_client/fatcat_openapi_client/__version__.py
@@ -1,3 +1,3 @@
 
-VERSION = (0, 4, 0) # eg, (0, 2, '0dev0')
+VERSION = (0, 5, 0) # eg, (0, 2, '0dev0')
 __version__ = '.'.join(map(str, VERSION))
diff --git a/python_openapi_client/fatcat_openapi_client/api_client.py b/python_openapi_client/fatcat_openapi_client/api_client.py
index efef8cbf..eb23f1cd 100644
--- a/python_openapi_client/fatcat_openapi_client/api_client.py
+++ b/python_openapi_client/fatcat_openapi_client/api_client.py
@@ -4,7 +4,7 @@
 
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata.   # noqa: E501
 
-    The version of the OpenAPI document: 0.4.0
+    The version of the OpenAPI document: 0.5.0
     Contact: webservices@archive.org
     Generated by: https://openapi-generator.tech
 """
@@ -77,7 +77,7 @@ class ApiClient(object):
             self.default_headers[header_name] = header_value
         self.cookie = cookie
         # Set default User-Agent.
-        self.user_agent = 'OpenAPI-Generator/0.4.0/python'
+        self.user_agent = 'OpenAPI-Generator/0.5.0/python'
 
     def __del__(self):
         if self._pool:
diff --git a/python_openapi_client/fatcat_openapi_client/configuration.py b/python_openapi_client/fatcat_openapi_client/configuration.py
index c0e39620..dacf77c9 100644
--- a/python_openapi_client/fatcat_openapi_client/configuration.py
+++ b/python_openapi_client/fatcat_openapi_client/configuration.py
@@ -5,7 +5,7 @@
 
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata.   # noqa: E501
 
-    The version of the OpenAPI document: 0.4.0
+    The version of the OpenAPI document: 0.5.0
     Contact: webservices@archive.org
     Generated by: https://openapi-generator.tech
 """
@@ -267,8 +267,8 @@ class Configuration(object):
         return "Python SDK Debug Report:\n"\
                "OS: {env}\n"\
                "Python Version: {pyversion}\n"\
-               "Version of the API: 0.4.0\n"\
-               "SDK Package Version: 0.4.0".\
+               "Version of the API: 0.5.0\n"\
+               "SDK Package Version: 0.5.0".\
                format(env=sys.platform, pyversion=sys.version)
 
     def get_host_settings(self):
diff --git a/python_openapi_client/fatcat_openapi_client/models/file_entity.py b/python_openapi_client/fatcat_openapi_client/models/file_entity.py
index e52635a6..d97a0a03 100644
--- a/python_openapi_client/fatcat_openapi_client/models/file_entity.py
+++ b/python_openapi_client/fatcat_openapi_client/models/file_entity.py
@@ -5,7 +5,7 @@
 
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata.   # noqa: E501
 
-    The version of the OpenAPI document: 0.3.1
+    The version of the OpenAPI document: 0.5.0
     Contact: webservices@archive.org
     Generated by: https://openapi-generator.tech
 """
@@ -44,6 +44,7 @@ class FileEntity(object):
         'sha256': 'str',
         'urls': 'list[FileUrl]',
         'mimetype': 'str',
+        'content_scope': 'str',
         'release_ids': 'list[str]',
         'releases': 'list[ReleaseEntity]'
     }
@@ -61,11 +62,12 @@ class FileEntity(object):
         'sha256': 'sha256',
         'urls': 'urls',
         'mimetype': 'mimetype',
+        'content_scope': 'content_scope',
         'release_ids': 'release_ids',
         'releases': 'releases'
     }
 
-    def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, size=None, md5=None, sha1=None, sha256=None, urls=None, mimetype=None, release_ids=None, releases=None):  # noqa: E501
+    def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, size=None, md5=None, sha1=None, sha256=None, urls=None, mimetype=None, content_scope=None, release_ids=None, releases=None):  # noqa: E501
         """FileEntity - a model defined in OpenAPI"""  # noqa: E501
 
         self._state = None
@@ -80,6 +82,7 @@ class FileEntity(object):
         self._sha256 = None
         self._urls = None
         self._mimetype = None
+        self._content_scope = None
         self._release_ids = None
         self._releases = None
         self.discriminator = None
@@ -108,6 +111,8 @@ class FileEntity(object):
             self.urls = urls
         if mimetype is not None:
             self.mimetype = mimetype
+        if content_scope is not None:
+            self.content_scope = content_scope
         if release_ids is not None:
             self.release_ids = release_ids
         if releases is not None:
@@ -426,6 +431,27 @@ class FileEntity(object):
         self._mimetype = mimetype
 
     @property
+    def content_scope(self):
+        """Gets the content_scope of this FileEntity.  # noqa: E501
+
+
+        :return: The content_scope of this FileEntity.  # noqa: E501
+        :rtype: str
+        """
+        return self._content_scope
+
+    @content_scope.setter
+    def content_scope(self, content_scope):
+        """Sets the content_scope of this FileEntity.
+
+
+        :param content_scope: The content_scope of this FileEntity.  # noqa: E501
+        :type: str
+        """
+
+        self._content_scope = content_scope
+
+    @property
     def release_ids(self):
         """Gets the release_ids of this FileEntity.  # noqa: E501
 
diff --git a/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py b/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py
index 51952d2a..dfc0787a 100644
--- a/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py
+++ b/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py
@@ -5,7 +5,7 @@
 
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata.   # noqa: E501
 
-    The version of the OpenAPI document: 0.3.1
+    The version of the OpenAPI document: 0.5.0
     Contact: webservices@archive.org
     Generated by: https://openapi-generator.tech
 """
@@ -38,6 +38,7 @@ class FilesetEntity(object):
         'redirect': 'str',
         'extra': 'dict(str, object)',
         'edit_extra': 'dict(str, object)',
+        'content_scope': 'str',
         'manifest': 'list[FilesetFile]',
         'urls': 'list[FilesetUrl]',
         'release_ids': 'list[str]',
@@ -51,13 +52,14 @@ class FilesetEntity(object):
         'redirect': 'redirect',
         'extra': 'extra',
         'edit_extra': 'edit_extra',
+        'content_scope': 'content_scope',
         'manifest': 'manifest',
         'urls': 'urls',
         'release_ids': 'release_ids',
         'releases': 'releases'
     }
 
-    def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, manifest=None, urls=None, release_ids=None, releases=None):  # noqa: E501
+    def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, content_scope=None, manifest=None, urls=None, release_ids=None, releases=None):  # noqa: E501
         """FilesetEntity - a model defined in OpenAPI"""  # noqa: E501
 
         self._state = None
@@ -66,6 +68,7 @@ class FilesetEntity(object):
         self._redirect = None
         self._extra = None
         self._edit_extra = None
+        self._content_scope = None
         self._manifest = None
         self._urls = None
         self._release_ids = None
@@ -84,6 +87,8 @@ class FilesetEntity(object):
             self.extra = extra
         if edit_extra is not None:
             self.edit_extra = edit_extra
+        if content_scope is not None:
+            self.content_scope = content_scope
         if manifest is not None:
             self.manifest = manifest
         if urls is not None:
@@ -254,6 +259,27 @@ class FilesetEntity(object):
         self._edit_extra = edit_extra
 
     @property
+    def content_scope(self):
+        """Gets the content_scope of this FilesetEntity.  # noqa: E501
+
+
+        :return: The content_scope of this FilesetEntity.  # noqa: E501
+        :rtype: str
+        """
+        return self._content_scope
+
+    @content_scope.setter
+    def content_scope(self, content_scope):
+        """Sets the content_scope of this FilesetEntity.
+
+
+        :param content_scope: The content_scope of this FilesetEntity.  # noqa: E501
+        :type: str
+        """
+
+        self._content_scope = content_scope
+
+    @property
     def manifest(self):
         """Gets the manifest of this FilesetEntity.  # noqa: E501
 
diff --git a/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py b/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py
index 82363c23..968b0b1c 100644
--- a/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py
+++ b/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py
@@ -5,7 +5,7 @@
 
     Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata.   # noqa: E501
 
-    The version of the OpenAPI document: 0.3.1
+    The version of the OpenAPI document: 0.5.0
     Contact: webservices@archive.org
     Generated by: https://openapi-generator.tech
 """
@@ -42,6 +42,7 @@ class WebcaptureEntity(object):
         'archive_urls': 'list[WebcaptureUrl]',
         'original_url': 'str',
         'timestamp': 'datetime',
+        'content_scope': 'str',
         'release_ids': 'list[str]',
         'releases': 'list[ReleaseEntity]'
     }
@@ -57,11 +58,12 @@ class WebcaptureEntity(object):
         'archive_urls': 'archive_urls',
         'original_url': 'original_url',
         'timestamp': 'timestamp',
+        'content_scope': 'content_scope',
         'release_ids': 'release_ids',
         'releases': 'releases'
     }
 
-    def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, cdx=None, archive_urls=None, original_url=None, timestamp=None, release_ids=None, releases=None):  # noqa: E501
+    def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, cdx=None, archive_urls=None, original_url=None, timestamp=None, content_scope=None, release_ids=None, releases=None):  # noqa: E501
         """WebcaptureEntity - a model defined in OpenAPI"""  # noqa: E501
 
         self._state = None
@@ -74,6 +76,7 @@ class WebcaptureEntity(object):
         self._archive_urls = None
         self._original_url = None
         self._timestamp = None
+        self._content_scope = None
         self._release_ids = None
         self._releases = None
         self.discriminator = None
@@ -98,6 +101,8 @@ class WebcaptureEntity(object):
             self.original_url = original_url
         if timestamp is not None:
             self.timestamp = timestamp
+        if content_scope is not None:
+            self.content_scope = content_scope
         if release_ids is not None:
             self.release_ids = release_ids
         if releases is not None:
@@ -352,6 +357,27 @@ class WebcaptureEntity(object):
         self._timestamp = timestamp
 
     @property
+    def content_scope(self):
+        """Gets the content_scope of this WebcaptureEntity.  # noqa: E501
+
+
+        :return: The content_scope of this WebcaptureEntity.  # noqa: E501
+        :rtype: str
+        """
+        return self._content_scope
+
+    @content_scope.setter
+    def content_scope(self, content_scope):
+        """Sets the content_scope of this WebcaptureEntity.
+
+
+        :param content_scope: The content_scope of this WebcaptureEntity.  # noqa: E501
+        :type: str
+        """
+
+        self._content_scope = content_scope
+
+    @property
     def release_ids(self):
         """Gets the release_ids of this WebcaptureEntity.  # noqa: E501
 
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 716f2b6b..6fa62c5f 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -607,7 +607,7 @@ dependencies = [
 
 [[package]]
 name = "fatcat"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "cadence",
  "chrono 0.4.6",
@@ -648,7 +648,7 @@ dependencies = [
 
 [[package]]
 name = "fatcat-openapi"
-version = "0.4.0"
+version = "0.5.0"
 dependencies = [
  "bodyparser",
  "chrono 0.4.6",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index e2ce5e41..b521316e 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fatcat"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2018"
 authors = ["Bryan Newbold <bnewbold@archive.org>"]
 description = "A scalable, versioned, API-oriented catalog for bibliographic entities and file metadata"
diff --git a/rust/fatcat-openapi/Cargo.toml b/rust/fatcat-openapi/Cargo.toml
index 7f417242..f532b780 100644
--- a/rust/fatcat-openapi/Cargo.toml
+++ b/rust/fatcat-openapi/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fatcat-openapi"
-version = "0.4.0"
+version = "0.5.0"
 edition = "2018"
 authors = ["Bryan Newbold <bnewbold@archive.org>"]
 description = "Fatcat is an editable bibliographic database. This OpenAPI code-generated crate container HTTP API models, endpoints, and other auto-generated types useful for both client and server implementations of the catalog API."
diff --git a/rust/fatcat-openapi/README.md b/rust/fatcat-openapi/README.md
index 327fe3a8..0ed53d3d 100644
--- a/rust/fatcat-openapi/README.md
+++ b/rust/fatcat-openapi/README.md
@@ -12,8 +12,8 @@ To see how to make this your own, look here:
 
 [README](https://github.com/swagger-api/swagger-codegen/blob/master/README.md)
 
-- API version: 0.4.0
-- Build date: 2021-10-12T23:51:46.767Z
+- API version: 0.5.0
+- Build date: 2021-11-17T22:18:19.232Z
 For more information, please visit [https://fatcat.wiki](https://fatcat.wiki)
 
 This autogenerated project defines an API crate `fatcat` which contains:
@@ -172,7 +172,7 @@ The server example is designed to form the basis for implementing your own serve
 
 * Set up a new Rust project, e.g., with `cargo init --bin`.
 * Insert `fatcat` into the `members` array under [workspace] in the root `Cargo.toml`, e.g., `members = [ "fatcat" ]`.
-* Add `fatcat = {version = "0.4.0", path = "fatcat"}` under `[dependencies]` in the root `Cargo.toml`.
+* Add `fatcat = {version = "0.5.0", path = "fatcat"}` under `[dependencies]` in the root `Cargo.toml`.
 * Copy the `[dependencies]` and `[dev-dependencies]` from `fatcat/Cargo.toml` into the root `Cargo.toml`'s `[dependencies]` section.
   * Copy all of the `[dev-dependencies]`, but only the `[dependencies]` that are required by the example server. These should be clearly indicated by comments.
   * Remove `"optional = true"` from each of these lines if present.
diff --git a/rust/fatcat-openapi/src/models.rs b/rust/fatcat-openapi/src/models.rs
index ca203c61..36c9105f 100644
--- a/rust/fatcat-openapi/src/models.rs
+++ b/rust/fatcat-openapi/src/models.rs
@@ -628,6 +628,10 @@ pub struct FileEntity {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub release_ids: Option<Vec<String>>,
 
+    #[serde(rename = "content_scope")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content_scope: Option<String>,
+
     #[serde(rename = "mimetype")]
     #[serde(skip_serializing_if = "Option::is_none")]
     pub mimetype: Option<String>,
@@ -692,6 +696,7 @@ impl FileEntity {
         FileEntity {
             releases: None,
             release_ids: None,
+            content_scope: None,
             mimetype: None,
             urls: None,
             sha256: None,
@@ -763,6 +768,10 @@ pub struct FilesetEntity {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub manifest: Option<Vec<models::FilesetFile>>,
 
+    #[serde(rename = "content_scope")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content_scope: Option<String>,
+
     // Note: inline enums are not fully supported by swagger-codegen
     #[serde(rename = "state")]
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -801,6 +810,7 @@ impl FilesetEntity {
             release_ids: None,
             urls: None,
             manifest: None,
+            content_scope: None,
             state: None,
             ident: None,
             revision: None,
@@ -1453,6 +1463,10 @@ pub struct WebcaptureEntity {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub release_ids: Option<Vec<String>>,
 
+    #[serde(rename = "content_scope")]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub content_scope: Option<String>,
+
     /// Same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Should generally be the timestamp of capture of the primary resource URL.
     #[serde(rename = "timestamp")]
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -1507,6 +1521,7 @@ impl WebcaptureEntity {
         WebcaptureEntity {
             releases: None,
             release_ids: None,
+            content_scope: None,
             timestamp: None,
             original_url: None,
             archive_urls: None,
diff --git a/rust/migrations/2021-11-17-222046_content_scope/down.sql b/rust/migrations/2021-11-17-222046_content_scope/down.sql
new file mode 100644
index 00000000..b2d55321
--- /dev/null
+++ b/rust/migrations/2021-11-17-222046_content_scope/down.sql
@@ -0,0 +1,10 @@
+-- Undo `up.sql`: drop the `content_scope` column from the file, fileset, and webcapture rev tables
+
+ALTER TABLE file_rev
+DROP COLUMN content_scope;
+
+ALTER TABLE fileset_rev
+DROP COLUMN content_scope;
+
+ALTER TABLE webcapture_rev
+DROP COLUMN content_scope;
diff --git a/rust/migrations/2021-11-17-222046_content_scope/up.sql b/rust/migrations/2021-11-17-222046_content_scope/up.sql
new file mode 100644
index 00000000..82c5f2e6
--- /dev/null
+++ b/rust/migrations/2021-11-17-222046_content_scope/up.sql
@@ -0,0 +1,27 @@
+-- This is the v0.5.0 schema
+-- Add a nullable `content_scope` TEXT column (must be non-empty when set) to the file, fileset, and webcapture rev tables
+
+ALTER TABLE file_rev
+ADD COLUMN content_scope       TEXT CHECK (octet_length(content_scope) >= 1);
+
+ALTER TABLE fileset_rev
+ADD COLUMN content_scope       TEXT CHECK (octet_length(content_scope) >= 1);
+
+ALTER TABLE webcapture_rev
+ADD COLUMN content_scope       TEXT CHECK (octet_length(content_scope) >= 1);
+
+-------------------- Update Test Revs --------------------------------------
+-- IMPORTANT: don't create new entities here; only mutate existing rows
+
+BEGIN;
+
+UPDATE file_rev SET content_scope = 'article'
+WHERE id = '00000000-0000-0000-3333-FFF000000003';
+
+UPDATE fileset_rev SET content_scope = 'dataset'
+WHERE id = '00000000-0000-0000-6666-fff000000003';
+
+UPDATE webcapture_rev SET content_scope = 'webpage'
+WHERE id = '00000000-0000-0000-7777-FFF000000003';
+
+COMMIT;
diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs
index 76c8675d..0427f9c8 100644
--- a/rust/src/database_models.rs
+++ b/rust/src/database_models.rs
@@ -220,6 +220,7 @@ pub struct FileRevRow {
     pub sha256: Option<String>,
     pub md5: Option<String>,
     pub mimetype: Option<String>,
+    pub content_scope: Option<String>,
 }
 
 #[derive(Debug, Associations, AsChangeset, Insertable)]
@@ -231,6 +232,7 @@ pub struct FileRevNewRow {
     pub sha256: Option<String>,
     pub md5: Option<String>,
     pub mimetype: Option<String>,
+    pub content_scope: Option<String>,
 }
 
 entity_structs!(
@@ -291,12 +293,14 @@ pub struct FilesetRevUrlNewRow {
 pub struct FilesetRevRow {
     pub id: Uuid,
     pub extra_json: Option<serde_json::Value>,
+    pub content_scope: Option<String>,
 }
 
 #[derive(Debug, Associations, AsChangeset, Insertable)]
 #[table_name = "fileset_rev"]
 pub struct FilesetRevNewRow {
     pub extra_json: Option<serde_json::Value>,
+    pub content_scope: Option<String>,
 }
 
 entity_structs!(
@@ -360,6 +364,7 @@ pub struct WebcaptureRevRow {
     pub extra_json: Option<serde_json::Value>,
     pub original_url: String,
     pub timestamp: chrono::NaiveDateTime,
+    pub content_scope: Option<String>,
 }
 
 #[derive(Debug, Associations, AsChangeset, Insertable)]
@@ -368,6 +373,7 @@ pub struct WebcaptureRevNewRow {
     pub extra_json: Option<serde_json::Value>,
     pub original_url: String,
     pub timestamp: chrono::NaiveDateTime,
+    pub content_scope: Option<String>,
 }
 
 entity_structs!(
diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs
index e0a54233..e3d16202 100644
--- a/rust/src/database_schema.rs
+++ b/rust/src/database_schema.rs
@@ -163,6 +163,7 @@ table! {
         sha256 -> Nullable<Text>,
         md5 -> Nullable<Text>,
         mimetype -> Nullable<Text>,
+        content_scope -> Nullable<Text>,
     }
 }
 
@@ -208,6 +209,7 @@ table! {
     fileset_rev (id) {
         id -> Uuid,
         extra_json -> Nullable<Jsonb>,
+        content_scope -> Nullable<Text>,
     }
 }
 
@@ -372,6 +374,7 @@ table! {
         extra_json -> Nullable<Jsonb>,
         original_url -> Text,
         timestamp -> Timestamptz,
+        content_scope -> Nullable<Text>,
     }
 }
 
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index 19cb58ea..f48246a5 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1042,6 +1042,7 @@ impl EntityCrud for FileEntity {
             size: None,
             urls: None,
             mimetype: None,
+            content_scope: None,
             release_ids: None,
             releases: None,
             state: Some(ident_row.state().unwrap().shortname()),
@@ -1125,6 +1126,7 @@ impl EntityCrud for FileEntity {
             size: rev_row.size_bytes,
             urls: Some(urls),
             mimetype: rev_row.mimetype,
+            content_scope: rev_row.content_scope,
             release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()),
             releases: None,
             state,
@@ -1160,6 +1162,7 @@ impl EntityCrud for FileEntity {
                         sha256: model.sha256.clone(),
                         md5: model.md5.clone(),
                         mimetype: model.mimetype.clone(),
+                        content_scope: model.content_scope.clone(),
                         extra_json: model.extra.clone(),
                     })
                     .collect::<Vec<FileRevNewRow>>(),
@@ -1245,6 +1248,7 @@ impl EntityCrud for FilesetEntity {
         }
 
         Ok(FilesetEntity {
+            content_scope: None,
             manifest: None,
             urls: None,
             release_ids: None,
@@ -1340,6 +1344,7 @@ impl EntityCrud for FilesetEntity {
             .collect();
 
         Ok(FilesetEntity {
+            content_scope: rev_row.content_scope.clone(),
             manifest: Some(manifest),
             urls: Some(urls),
             release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()),
@@ -1376,6 +1381,7 @@ impl EntityCrud for FilesetEntity {
                 models
                     .iter()
                     .map(|model| FilesetRevNewRow {
+                        content_scope: model.content_scope.clone(),
                         extra_json: model.extra.clone(),
                     })
                     .collect::<Vec<FilesetRevNewRow>>(),
@@ -1492,6 +1498,7 @@ impl EntityCrud for WebcaptureEntity {
             archive_urls: None,
             original_url: None,
             timestamp: None,
+            content_scope: None,
             release_ids: None,
             releases: None,
             state: Some(ident_row.state().unwrap().shortname()),
@@ -1590,6 +1597,7 @@ impl EntityCrud for WebcaptureEntity {
             archive_urls: Some(archive_urls),
             original_url: Some(rev_row.original_url),
             timestamp: Some(chrono::DateTime::from_utc(rev_row.timestamp, chrono::Utc)),
+            content_scope: rev_row.content_scope,
             release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()),
             releases: None,
             state,
@@ -1628,6 +1636,7 @@ impl EntityCrud for WebcaptureEntity {
                         // these unwraps safe because of check above
                         original_url: model.original_url.clone().unwrap(),
                         timestamp: model.timestamp.unwrap().naive_utc(),
+                        content_scope: model.content_scope.clone(),
                         extra_json: model.extra.clone(),
                     })
                     .collect::<Vec<WebcaptureRevNewRow>>(),
diff --git a/rust/src/server.rs b/rust/src/server.rs
index 4d30ecbe..fb55f03c 100644
--- a/rust/src/server.rs
+++ b/rust/src/server.rs
@@ -72,6 +72,7 @@ pub fn create_test_server() -> Result<Server> {
     diesel_migrations::revert_latest_migration(&conn).unwrap();
     diesel_migrations::revert_latest_migration(&conn).unwrap();
     diesel_migrations::revert_latest_migration(&conn).unwrap();
+    diesel_migrations::revert_latest_migration(&conn).unwrap();
     diesel_migrations::run_pending_migrations(&conn).unwrap();
     Ok(server)
 }
diff --git a/rust/tests/test_api_server_http.rs b/rust/tests/test_api_server_http.rs
index 8f691e0d..0601a26b 100644
--- a/rust/tests/test_api_server_http.rs
+++ b/rust/tests/test_api_server_http.rs
@@ -636,6 +636,7 @@ fn test_post_file() {
                     {"url": "http://web.archive.org/2/http://archive.org/asdf.txt", "rel": "webarchive" }
                 ],
                 "mimetype": "application/pdf",
+                "content_scope": "article",
                 "release_ids": [
                     "aaaaaaaaaaaaarceaaaaaaaaae",
                     "aaaaaaaaaaaaarceaaaaaaaaai"
@@ -711,6 +712,7 @@ fn test_post_fileset() {
                     "aaaaaaaaaaaaarceaaaaaaaaae",
                     "aaaaaaaaaaaaarceaaaaaaaaai"
                 ],
+                "content_scope": "dataset",
                 "extra": { "source": "speculation" }
                 }"#,
             &router,
@@ -764,6 +766,7 @@ fn test_post_webcapture() {
             headers.clone(),
             r#"{"original_url": "https://bnewbold.net/",
                 "timestamp": "2018-12-28T05:06:07Z",
+                "content_scope": "landing-page",
                 "cdx": [
                     {"surt": "org,asheesh,)/robots.txt",
                      "timestamp": "2018-12-28T05:06:07Z",
author	Bryan Newbold <bnewbold@robocracy.org>	2021-11-22 16:12:01 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2021-11-22 16:12:01 -0800
commit	5c7f50b2f497692493bfa54ad4741fdc573352ae (patch)
tree	c20cce1884076fffe210ba28e1a569f93ed22827
parent	f3bd82c0308948a63645538bdd9511a503625499 (diff)
parent	dd00cec4164c1a1c31c8d9cffb92deb2e30b2211 (diff)
download	fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.tar.gz fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.zip