aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-22 16:12:01 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-22 16:12:01 -0800
commit5c7f50b2f497692493bfa54ad4741fdc573352ae (patch)
treec20cce1884076fffe210ba28e1a569f93ed22827
parentf3bd82c0308948a63645538bdd9511a503625499 (diff)
parentdd00cec4164c1a1c31c8d9cffb92deb2e30b2211 (diff)
downloadfatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.tar.gz
fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.zip
Merge branch 'bnewbold-content-scope'
-rw-r--r--CHANGELOG.md19
-rw-r--r--extra/elasticsearch/file_schema.json1
-rw-r--r--fatcat-openapi2.yml11
-rw-r--r--guide/src/entity_file.md40
-rw-r--r--guide/src/entity_fileset.md4
-rw-r--r--guide/src/entity_webcapture.md6
-rw-r--r--proposals/2021-11-17_content_scope.md84
-rw-r--r--python/Pipfile.lock2
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py1
-rw-r--r--python/tests/api_files.py2
-rw-r--r--python/tests/api_filesets.py2
-rw-r--r--python/tests/api_webcaptures.py2
-rw-r--r--python_openapi_client/README.md4
-rwxr-xr-xpython_openapi_client/codegen_python_client.sh2
-rw-r--r--python_openapi_client/fatcat_openapi_client/__init__.py4
-rw-r--r--python_openapi_client/fatcat_openapi_client/__version__.py2
-rw-r--r--python_openapi_client/fatcat_openapi_client/api_client.py4
-rw-r--r--python_openapi_client/fatcat_openapi_client/configuration.py6
-rw-r--r--python_openapi_client/fatcat_openapi_client/models/file_entity.py30
-rw-r--r--python_openapi_client/fatcat_openapi_client/models/fileset_entity.py30
-rw-r--r--python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py30
-rw-r--r--rust/Cargo.lock4
-rw-r--r--rust/Cargo.toml2
-rw-r--r--rust/fatcat-openapi/Cargo.toml2
-rw-r--r--rust/fatcat-openapi/README.md6
-rw-r--r--rust/fatcat-openapi/src/models.rs15
-rw-r--r--rust/migrations/2021-11-17-222046_content_scope/down.sql10
-rw-r--r--rust/migrations/2021-11-17-222046_content_scope/up.sql27
-rw-r--r--rust/src/database_models.rs6
-rw-r--r--rust/src/database_schema.rs3
-rw-r--r--rust/src/entity_crud.rs9
-rw-r--r--rust/src/server.rs1
-rw-r--r--rust/tests/test_api_server_http.rs3
33 files changed, 346 insertions, 28 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index af5d96c4..4f080a17 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,11 @@ See also:
- [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
-## [UNRELEASED]
+## [0.5.0] - UNRELEASED
+
+Small change to the API schema (and SQL schema), adding the `content_scope`
+field to file, fileset, and webcapture entities. Because there is a SQL schema
+change, bumping to version v0.5.0.
An outward-facing change is that the fatcat API server now attempts to
"stabilize" array order within JSON responses by sorting elements by
@@ -28,6 +32,9 @@ was created.
In particular, this may cause broad discrepancies compared to historical bulk
metadata exports. New bulk exports will be generated with the new ordering.
+A number of content cleanups and changes are also taking place to the primary
+catalog (fatcat.wiki); see the separate content CHANGELOG for details.
+
### Fixed
- API array order stabilization, using `ORDER BY` in `fatcatd`. See note above.
@@ -35,12 +42,22 @@ metadata exports. New bulk exports will be generated with the new ordering.
### Changed
- broad python code style updates: formatting, lint rules, and type annotations
+- a number of internal refactors of metadata importers
+- stopped creating a small number of Datacite-specific license slugs
+- stopped trying to "fix" double slashes in DOIs, in most cases
+- reduced amount of metadata stored in release `extra` field in Datacite
+ importer
### Added
+- `content_scope` field on file, fileset, and webcapture entities
- initial fileset importers
- JSON pseudo-API for reference string match/get interface
+### Removed
+
+- deleted deprecated `cdl_dash_dat` and `wayback_static` one-time importers
+
## [0.4.0] - 2021-10-14
Includes small API and SQL schema changes; see
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index a8dbc6d0..34e2b0b3 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -37,6 +37,7 @@
"release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false },
"release_count": { "type": "integer" },
"mimetype": { "type": "keyword", "normalizer": "default" },
+ "content_scope": { "type": "keyword", "normalizer": "default" },
"size_bytes": { "type": "integer" },
"sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
"sha256": { "type": "keyword", "normalizer": "default", "doc_values": false },
diff --git a/fatcat-openapi2.yml b/fatcat-openapi2.yml
index 67915dda..7fafdb89 100644
--- a/fatcat-openapi2.yml
+++ b/fatcat-openapi2.yml
@@ -2,7 +2,7 @@
swagger: "2.0"
info:
title: fatcat
- version: 0.4.0
+ version: 0.5.0
description: |
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic
entities and file metadata.
@@ -460,6 +460,9 @@ definitions:
mimetype:
type: string
example: "application/pdf"
+ content_scope:
+ type: string
+ example: "issue"
release_ids:
type: array
items:
@@ -500,6 +503,9 @@ definitions:
type: object
properties:
<<: *ENTITYPROPS
+ content_scope:
+ type: string
+ example: "issue"
manifest:
# limit of 200 files, at least to start
type: array
@@ -601,6 +607,9 @@ definitions:
Same format as CDX line timestamp (UTC, etc). Corresponds to the
overall capture timestamp. Should generally be the timestamp of
capture of the primary resource URL.
+ content_scope:
+ type: string
+ example: "landing-page"
release_ids:
type: array
items:
diff --git a/guide/src/entity_file.md b/guide/src/entity_file.md
index 7429c982..84d9eac4 100644
--- a/guide/src/entity_file.md
+++ b/guide/src/entity_file.md
@@ -13,9 +13,13 @@
- `urls`: An array of "typed" URLs. Order is not meaningful, and may not be
preserved.
- `url` (string, required): Eg: "https://example.edu/~frau/prcding.pdf".
- - `rel` (string, required): Eg: "webarchive".
+ - `rel` (string, required): Eg: "webarchive", see vocabulary below.
- `mimetype` (string): Format of the file. If XML, specific schema can be
included after a `+`. Example: "application/pdf"
+- `content_scope` (string): for situations where the file does not simply
+ contain the full representation of a work (eg, fulltext of an article, for an
+ `article-journal` release), describes what that scope of coverage is. Eg,
+ entire `issue`, `corrupt` file. See vocabulary below.
- `release_ids` (array of string identifiers): references to `release` entities
that this file represents a manifestation of. Note that a single file can
contain multiple release references (eg, a PDF containing a full issue with
@@ -35,3 +39,37 @@
Scholar
- `dweb`: content hosted on distributed/decentralized web protocols, such as
`dat://` or `ipfs://` URLs
+
+#### `content_scope` Vocabulary
+
+This same vocabulary is shared between file, fileset, and webcapture entities;
+not all the values make sense for each entity type.
+
+- if not set, assume that the artifact entity is valid and represents a
+ complete copy of the release
+- `issue`: artifact contains an entire issue of a serial publication (eg, issue
+ of a journal), representing several releases in full
+- `abstract`: contains only an abstract (short description) of the release, not
+ the release itself (unless the `release_type` itself is `abstract`, in which
+ case it is the entire release)
+- `index`: index of a journal, or series of abstracts from a conference
+- `slides`: slide deck (usually in "landscape" orientation)
+- `front-matter`: non-article content from a journal, such as editorial policies
+- `supplement`: usually a file entity which is a supplement or appendix, not
+ the entire work
+- `component`: a sub-component of a release, which may or may not be associated
+ with a `component` release entity. For example, a single figure or table as
+ part of an article
+- `poster`: digital copy of a poster, eg as displayed at conference poster sessions
+- `sample`: a partial sample of the entire work. eg, just the first page of an
+ article. distinct from `truncated`
+- `truncated`: the file has been truncated at a binary level, and may also be
+ corrupt or invalid. distinct from `sample`
+- `corrupt`: broken, mangled, or corrupt file (at the binary level)
+- `stub`: any other out-of-scope artifact situations, where the artifact
+ represents something which would not link to any possible in-scope release in
+ the catalog (except a `stub` release)
+- `landing-page`: for webcapture, the landing page of a work, as opposed to the
+ work itself
+- `spam`: content is spam. articles, webpages, or issues which include
+ incidental advertisements within them are not counted as `spam`
diff --git a/guide/src/entity_fileset.md b/guide/src/entity_fileset.md
index e1ac3e67..6083a09d 100644
--- a/guide/src/entity_fileset.md
+++ b/guide/src/entity_fileset.md
@@ -21,6 +21,10 @@
- `rel` (string, required):
Eg: "webarchive".
- `release_ids` (array of string identifiers): references to `release` entities
+- `content_scope` (string): for situations where the fileset does not simply
+ contain the full representation of a work (eg, all files in dataset, for a
+ `dataset` release), describes what that scope of coverage is. Uses same
+ vocabulary as File entity.
- `extra` (object with string keys): additional metadata about this group of
files, including upstream platform-specific metadata and identifiers
diff --git a/guide/src/entity_webcapture.md b/guide/src/entity_webcapture.md
index 8c5615fb..1b3cac55 100644
--- a/guide/src/entity_webcapture.md
+++ b/guide/src/entity_webcapture.md
@@ -29,4 +29,10 @@ Warning: This schema is not yet stable.
- `timestamp` (string, datetime): same format as CDX line timestamp (UTC, etc).
Corresponds to the overall capture timestamp. Can be the earliest of CDX
timestamps if that makes sense
+- `content_scope` (string): for situations where the webcapture does not simply
+ contain the full representation of a work (eg, HTML fulltext, for an
+ `article-journal` release), describes what that scope of coverage is. Eg,
+ `landing-page` if it doesn't contain the full content. Landing pages are
+ out-of-scope for fatcat, but if they were accidentally imported, should mark
+ them as such so they aren't re-imported. Uses same vocabulary as File entity.
- `release_ids` (array of string identifiers): references to `release` entities
diff --git a/proposals/2021-11-17_content_scope.md b/proposals/2021-11-17_content_scope.md
new file mode 100644
index 00000000..8d04808e
--- /dev/null
+++ b/proposals/2021-11-17_content_scope.md
@@ -0,0 +1,84 @@
+
+status: planned
+
+Content Scope Fields
+======================
+
+Usually, "artifact" entities (file, fileset, webcapture) should not contain
+bibliographic metadata about their contents. For example, a file entity
+describing a PDF of a journal article should not indicate the publication
+stage, retraction status, publication type, journal ISSN, or other metadata
+about that article; the `release` entity should contain that information.
+Additionally, it is usually assumed that a single "artifact" entity is a
+complete representation of any associated release entities: the complete
+dataset, or complete article.
+
+This document describes a new metadata field to handle some special cases that
+go against this principle: the `content_scope` of a file, fileset, or
+webcapture. It is intended to be used when there is an exception to the
+assumption that a single "artifact" is a complete representation of a release.
+It is particularly useful when there is a problem with the artifact, resulting
+in it being disassociated from all releases.
+
+
+## Values
+
+This section will get copied to the guide.
+
+- if not set, assume that the artifact entity is valid and represents a
+ complete copy of the release
+- `issue`: artifact contains an entire issue of a serial publication (eg, issue
+ of a journal), representing several releases in full
+- `abstract`: contains only an abstract (short description) of the release, not
+ the release itself (unless the `release_type` itself is `abstract`, in which
+ case it is the entire release)
+- `index`: index of a journal, or series of abstracts from a conference (TODO:
+ separate value for conference abstract lists?)
+- `slides`: slide deck (usually in "landscape" orientation)
+- `front-matter`: non-article content from a journal, such as editorial policies
+- `supplement`: usually a file entity which is a supplement or appendix, not
+ the entire work
+- `component`: a sub-component of a release, which may or may not be associated
+ with a `component` release entity. For example, a single figure or table as
+ part of an article
+- `poster`: digital copy of a poster, eg as displayed at conference poster sessions
+- `sample`: a partial sample of the entire work. eg, just the first page of an
+ article. distinct from `truncated`
+- `truncated`: the file has been truncated at a binary level, and may also be
+ corrupt or invalid. distinct from `sample`
+- `corrupt`: broken, mangled, or corrupt file (at the binary level)
+- `stub`: any other out-of-scope artifact situations, where the artifact
+ represents something which would not link to any possible in-scope release in
+ the catalog (except a `stub` release)
+- `landing-page`: for webcapture, the landing page of a work, as opposed to the
+ work itself
+- `spam`: content is spam. articles, webpages, or issues which include
+ incidental advertisements within them are not counted as `spam`
+
+
+## Implementation
+
+The string field `content_scope` will be added to file, fileset, and webcapture
+entities.
+
+By default, this field does not need to be set. If it is empty, it can be
+assumed that the artifact represents an appropriate copy of the full release.
+If it is set, and the artifact is associated with one or more releases,
+downstream users/code may want to verify that the `content_scope` and
+`release_type` values are consistent. For example, `slides` is not consistent
+with `article-journal`, so such a file should be marked for review, and not
+considered a valid access option or preservation copy for the purposes of
+coverage analysis.
+
+
+## Removing Release Linkage
+
+In cases where the "artifact" entity is not an acceptable representation of any
+release (eg, truncation, corruption, spam), the entity should have the
+`release_ids` field cleared.
+
+Optionally, the new `extra` field `related_release_ids` can be used to indicate
+that an artifact entity has something to do with specific releases, but is not
+a full representation of them. This can be useful for corrupt or partial
+content to link to releases it is a partial representation of.
+
diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index a5c26410..2c7d08b0 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -262,7 +262,7 @@
},
"fatcat-openapi-client": {
"path": "./../python_openapi_client",
- "version": "==0.4.0"
+ "version": "==0.5.0"
},
"filelock": {
"hashes": [
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index d4962205..c16053ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -650,6 +650,7 @@ def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]:
release_ids=entity.release_ids,
release_count=len(entity.release_ids),
mimetype=entity.mimetype,
+ content_scope=entity.content_scope,
size_bytes=entity.size,
sha1=entity.sha1,
sha256=entity.sha256,
diff --git a/python/tests/api_files.py b/python/tests/api_files.py
index 8f9caf3e..cd08eef7 100644
--- a/python/tests/api_files.py
+++ b/python/tests/api_files.py
@@ -13,6 +13,7 @@ def test_file(api):
sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2",
sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3",
mimetype="application/pdf",
+ content_scope="article",
urls=[
FileUrl(
url="https://web.archive.org/web/12345542/something.com/blah.pdf",
@@ -39,6 +40,7 @@ def test_file(api):
assert f1.sha1 == f2.sha1
assert f1.sha256 == f2.sha256
assert f1.mimetype == f2.mimetype
+ assert f1.content_scope == f2.content_scope
assert f1.extra == f2.extra
assert f1.urls == f2.urls
assert f1.release_ids == f2.release_ids
diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py
index d7654eb9..8ab658f5 100644
--- a/python/tests/api_filesets.py
+++ b/python/tests/api_filesets.py
@@ -34,6 +34,7 @@ def test_fileset(api):
FilesetUrl(url="https://humble-host.com/~user123/dataset/", rel="web"),
],
release_ids=[r1edit.ident],
+ content_scope="dataset",
extra=dict(t=4, u=9),
edit_extra=dict(test_key="filesets rule"),
)
@@ -52,6 +53,7 @@ def test_fileset(api):
assert fs1.urls == fs2.urls
assert fs1.manifest == fs2.manifest
assert fs1.release_ids == fs2.release_ids
+ assert fs1.content_scope == fs2.content_scope
assert fs1.extra == fs2.extra
# expansion
diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py
index 76bc68c0..6a477ff2 100644
--- a/python/tests/api_webcaptures.py
+++ b/python/tests/api_webcaptures.py
@@ -44,6 +44,7 @@ def test_webcapture(api):
FileUrl(rel="wayback", url="https://web.archive.org/web/"),
],
release_ids=[r1edit.ident],
+ content_scope="landing-page",
extra=dict(c=1, b=2),
edit_extra=dict(test_key="webcaptures rule"),
)
@@ -69,6 +70,7 @@ def test_webcapture(api):
assert wc1.release_ids == wc2.release_ids
assert wc1.timestamp == wc2.timestamp
assert wc1.original_url == wc2.original_url
+ assert wc1.content_scope == wc2.content_scope
assert wc1.extra == wc2.extra
# check release expansion
diff --git a/python_openapi_client/README.md b/python_openapi_client/README.md
index 8cc34147..316b9dfd 100644
--- a/python_openapi_client/README.md
+++ b/python_openapi_client/README.md
@@ -3,8 +3,8 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities
This Python package is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
-- API version: 0.4.0
-- Package version: 0.4.0
+- API version: 0.5.0
+- Package version: 0.5.0
- Build package: org.openapitools.codegen.languages.PythonClientCodegen
For more information, please visit [https://fatcat.wiki](https://fatcat.wiki)
diff --git a/python_openapi_client/codegen_python_client.sh b/python_openapi_client/codegen_python_client.sh
index bbf2ad83..17782c68 100755
--- a/python_openapi_client/codegen_python_client.sh
+++ b/python_openapi_client/codegen_python_client.sh
@@ -20,7 +20,7 @@ docker run \
--input-spec /tmp/swagger/api.yml \
--output /tmp/swagger/ \
--package-name=fatcat_openapi_client \
- -p packageVersion="0.4.0"
+ -p packageVersion="0.5.0"
sudo chown -R `whoami`:`whoami` $OUTPUT
mkdir -p fatcat_openapi_client
diff --git a/python_openapi_client/fatcat_openapi_client/__init__.py b/python_openapi_client/fatcat_openapi_client/__init__.py
index 5f1a7fba..8749264e 100644
--- a/python_openapi_client/fatcat_openapi_client/__init__.py
+++ b/python_openapi_client/fatcat_openapi_client/__init__.py
@@ -7,7 +7,7 @@
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501
- The version of the OpenAPI document: 0.4.0
+ The version of the OpenAPI document: 0.5.0
Contact: webservices@archive.org
Generated by: https://openapi-generator.tech
"""
@@ -15,7 +15,7 @@
from __future__ import absolute_import
-__version__ = "0.4.0"
+__version__ = "0.5.0"
# import apis into sdk package
from fatcat_openapi_client.api.default_api import DefaultApi
diff --git a/python_openapi_client/fatcat_openapi_client/__version__.py b/python_openapi_client/fatcat_openapi_client/__version__.py
index 618922fe..b0634514 100644
--- a/python_openapi_client/fatcat_openapi_client/__version__.py
+++ b/python_openapi_client/fatcat_openapi_client/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (0, 4, 0) # eg, (0, 2, '0dev0')
+VERSION = (0, 5, 0) # eg, (0, 2, '0dev0')
__version__ = '.'.join(map(str, VERSION))
diff --git a/python_openapi_client/fatcat_openapi_client/api_client.py b/python_openapi_client/fatcat_openapi_client/api_client.py
index efef8cbf..eb23f1cd 100644
--- a/python_openapi_client/fatcat_openapi_client/api_client.py
+++ b/python_openapi_client/fatcat_openapi_client/api_client.py
@@ -4,7 +4,7 @@
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501
- The version of the OpenAPI document: 0.4.0
+ The version of the OpenAPI document: 0.5.0
Contact: webservices@archive.org
Generated by: https://openapi-generator.tech
"""
@@ -77,7 +77,7 @@ class ApiClient(object):
self.default_headers[header_name] = header_value
self.cookie = cookie
# Set default User-Agent.
- self.user_agent = 'OpenAPI-Generator/0.4.0/python'
+ self.user_agent = 'OpenAPI-Generator/0.5.0/python'
def __del__(self):
if self._pool:
diff --git a/python_openapi_client/fatcat_openapi_client/configuration.py b/python_openapi_client/fatcat_openapi_client/configuration.py
index c0e39620..dacf77c9 100644
--- a/python_openapi_client/fatcat_openapi_client/configuration.py
+++ b/python_openapi_client/fatcat_openapi_client/configuration.py
@@ -5,7 +5,7 @@
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501
- The version of the OpenAPI document: 0.4.0
+ The version of the OpenAPI document: 0.5.0
Contact: webservices@archive.org
Generated by: https://openapi-generator.tech
"""
@@ -267,8 +267,8 @@ class Configuration(object):
return "Python SDK Debug Report:\n"\
"OS: {env}\n"\
"Python Version: {pyversion}\n"\
- "Version of the API: 0.4.0\n"\
- "SDK Package Version: 0.4.0".\
+ "Version of the API: 0.5.0\n"\
+ "SDK Package Version: 0.5.0".\
format(env=sys.platform, pyversion=sys.version)
def get_host_settings(self):
diff --git a/python_openapi_client/fatcat_openapi_client/models/file_entity.py b/python_openapi_client/fatcat_openapi_client/models/file_entity.py
index e52635a6..d97a0a03 100644
--- a/python_openapi_client/fatcat_openapi_client/models/file_entity.py
+++ b/python_openapi_client/fatcat_openapi_client/models/file_entity.py
@@ -5,7 +5,7 @@
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501
- The version of the OpenAPI document: 0.3.1
+ The version of the OpenAPI document: 0.5.0
Contact: webservices@archive.org
Generated by: https://openapi-generator.tech
"""
@@ -44,6 +44,7 @@ class FileEntity(object):
'sha256': 'str',
'urls': 'list[FileUrl]',
'mimetype': 'str',
+ 'content_scope': 'str',
'release_ids': 'list[str]',
'releases': 'list[ReleaseEntity]'
}
@@ -61,11 +62,12 @@ class FileEntity(object):
'sha256': 'sha256',
'urls': 'urls',
'mimetype': 'mimetype',
+ 'content_scope': 'content_scope',
'release_ids': 'release_ids',
'releases': 'releases'
}
- def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, size=None, md5=None, sha1=None, sha256=None, urls=None, mimetype=None, release_ids=None, releases=None): # noqa: E501
+ def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, size=None, md5=None, sha1=None, sha256=None, urls=None, mimetype=None, content_scope=None, release_ids=None, releases=None): # noqa: E501
"""FileEntity - a model defined in OpenAPI""" # noqa: E501
self._state = None
@@ -80,6 +82,7 @@ class FileEntity(object):
self._sha256 = None
self._urls = None
self._mimetype = None
+ self._content_scope = None
self._release_ids = None
self._releases = None
self.discriminator = None
@@ -108,6 +111,8 @@ class FileEntity(object):
self.urls = urls
if mimetype is not None:
self.mimetype = mimetype
+ if content_scope is not None:
+ self.content_scope = content_scope
if release_ids is not None:
self.release_ids = release_ids
if releases is not None:
@@ -426,6 +431,27 @@ class FileEntity(object):
self._mimetype = mimetype
@property
+ def content_scope(self):
+ """Gets the content_scope of this FileEntity. # noqa: E501
+
+
+ :return: The content_scope of this FileEntity. # noqa: E501
+ :rtype: str
+ """
+ return self._content_scope
+
+ @content_scope.setter
+ def content_scope(self, content_scope):
+ """Sets the content_scope of this FileEntity.
+
+
+ :param content_scope: The content_scope of this FileEntity. # noqa: E501
+ :type: str
+ """
+
+ self._content_scope = content_scope
+
+ @property
def release_ids(self):
"""Gets the release_ids of this FileEntity. # noqa: E501
diff --git a/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py b/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py
index 51952d2a..dfc0787a 100644
--- a/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py
+++ b/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py
@@ -5,7 +5,7 @@
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501
- The version of the OpenAPI document: 0.3.1
+ The version of the OpenAPI document: 0.5.0
Contact: webservices@archive.org
Generated by: https://openapi-generator.tech
"""
@@ -38,6 +38,7 @@ class FilesetEntity(object):
'redirect': 'str',
'extra': 'dict(str, object)',
'edit_extra': 'dict(str, object)',
+ 'content_scope': 'str',
'manifest': 'list[FilesetFile]',
'urls': 'list[FilesetUrl]',
'release_ids': 'list[str]',
@@ -51,13 +52,14 @@ class FilesetEntity(object):
'redirect': 'redirect',
'extra': 'extra',
'edit_extra': 'edit_extra',
+ 'content_scope': 'content_scope',
'manifest': 'manifest',
'urls': 'urls',
'release_ids': 'release_ids',
'releases': 'releases'
}
- def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, manifest=None, urls=None, release_ids=None, releases=None): # noqa: E501
+ def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, content_scope=None, manifest=None, urls=None, release_ids=None, releases=None): # noqa: E501
"""FilesetEntity - a model defined in OpenAPI""" # noqa: E501
self._state = None
@@ -66,6 +68,7 @@ class FilesetEntity(object):
self._redirect = None
self._extra = None
self._edit_extra = None
+ self._content_scope = None
self._manifest = None
self._urls = None
self._release_ids = None
@@ -84,6 +87,8 @@ class FilesetEntity(object):
self.extra = extra
if edit_extra is not None:
self.edit_extra = edit_extra
+ if content_scope is not None:
+ self.content_scope = content_scope
if manifest is not None:
self.manifest = manifest
if urls is not None:
@@ -254,6 +259,27 @@ class FilesetEntity(object):
self._edit_extra = edit_extra
@property
+ def content_scope(self):
+ """Gets the content_scope of this FilesetEntity. # noqa: E501
+
+
+ :return: The content_scope of this FilesetEntity. # noqa: E501
+ :rtype: str
+ """
+ return self._content_scope
+
+ @content_scope.setter
+ def content_scope(self, content_scope):
+ """Sets the content_scope of this FilesetEntity.
+
+
+ :param content_scope: The content_scope of this FilesetEntity. # noqa: E501
+ :type: str
+ """
+
+ self._content_scope = content_scope
+
+ @property
def manifest(self):
"""Gets the manifest of this FilesetEntity. # noqa: E501
diff --git a/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py b/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py
index 82363c23..968b0b1c 100644
--- a/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py
+++ b/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py
@@ -5,7 +5,7 @@
Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501
- The version of the OpenAPI document: 0.3.1
+ The version of the OpenAPI document: 0.5.0
Contact: webservices@archive.org
Generated by: https://openapi-generator.tech
"""
@@ -42,6 +42,7 @@ class WebcaptureEntity(object):
'archive_urls': 'list[WebcaptureUrl]',
'original_url': 'str',
'timestamp': 'datetime',
+ 'content_scope': 'str',
'release_ids': 'list[str]',
'releases': 'list[ReleaseEntity]'
}
@@ -57,11 +58,12 @@ class WebcaptureEntity(object):
'archive_urls': 'archive_urls',
'original_url': 'original_url',
'timestamp': 'timestamp',
+ 'content_scope': 'content_scope',
'release_ids': 'release_ids',
'releases': 'releases'
}
- def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, cdx=None, archive_urls=None, original_url=None, timestamp=None, release_ids=None, releases=None): # noqa: E501
+ def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, cdx=None, archive_urls=None, original_url=None, timestamp=None, content_scope=None, release_ids=None, releases=None): # noqa: E501
"""WebcaptureEntity - a model defined in OpenAPI""" # noqa: E501
self._state = None
@@ -74,6 +76,7 @@ class WebcaptureEntity(object):
self._archive_urls = None
self._original_url = None
self._timestamp = None
+ self._content_scope = None
self._release_ids = None
self._releases = None
self.discriminator = None
@@ -98,6 +101,8 @@ class WebcaptureEntity(object):
self.original_url = original_url
if timestamp is not None:
self.timestamp = timestamp
+ if content_scope is not None:
+ self.content_scope = content_scope
if release_ids is not None:
self.release_ids = release_ids
if releases is not None:
@@ -352,6 +357,27 @@ class WebcaptureEntity(object):
self._timestamp = timestamp
@property
+ def content_scope(self):
+ """Gets the content_scope of this WebcaptureEntity. # noqa: E501
+
+
+ :return: The content_scope of this WebcaptureEntity. # noqa: E501
+ :rtype: str
+ """
+ return self._content_scope
+
+ @content_scope.setter
+ def content_scope(self, content_scope):
+ """Sets the content_scope of this WebcaptureEntity.
+
+
+ :param content_scope: The content_scope of this WebcaptureEntity. # noqa: E501
+ :type: str
+ """
+
+ self._content_scope = content_scope
+
+ @property
def release_ids(self):
"""Gets the release_ids of this WebcaptureEntity. # noqa: E501
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
index 716f2b6b..6fa62c5f 100644
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -607,7 +607,7 @@ dependencies = [
[[package]]
name = "fatcat"
-version = "0.4.0"
+version = "0.5.0"
dependencies = [
"cadence",
"chrono 0.4.6",
@@ -648,7 +648,7 @@ dependencies = [
[[package]]
name = "fatcat-openapi"
-version = "0.4.0"
+version = "0.5.0"
dependencies = [
"bodyparser",
"chrono 0.4.6",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index e2ce5e41..b521316e 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fatcat"
-version = "0.4.0"
+version = "0.5.0"
edition = "2018"
authors = ["Bryan Newbold <bnewbold@archive.org>"]
description = "A scalable, versioned, API-oriented catalog for bibliographic entities and file metadata"
diff --git a/rust/fatcat-openapi/Cargo.toml b/rust/fatcat-openapi/Cargo.toml
index 7f417242..f532b780 100644
--- a/rust/fatcat-openapi/Cargo.toml
+++ b/rust/fatcat-openapi/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "fatcat-openapi"
-version = "0.4.0"
+version = "0.5.0"
edition = "2018"
authors = ["Bryan Newbold <bnewbold@archive.org>"]
description = "Fatcat is an editable bibliographic database. This OpenAPI code-generated crate container HTTP API models, endpoints, and other auto-generated types useful for both client and server implementations of the catalog API."
diff --git a/rust/fatcat-openapi/README.md b/rust/fatcat-openapi/README.md
index 327fe3a8..0ed53d3d 100644
--- a/rust/fatcat-openapi/README.md
+++ b/rust/fatcat-openapi/README.md
@@ -12,8 +12,8 @@ To see how to make this your own, look here:
[README](https://github.com/swagger-api/swagger-codegen/blob/master/README.md)
-- API version: 0.4.0
-- Build date: 2021-10-12T23:51:46.767Z
+- API version: 0.5.0
+- Build date: 2021-11-17T22:18:19.232Z
For more information, please visit [https://fatcat.wiki](https://fatcat.wiki)
This autogenerated project defines an API crate `fatcat` which contains:
@@ -172,7 +172,7 @@ The server example is designed to form the basis for implementing your own serve
* Set up a new Rust project, e.g., with `cargo init --bin`.
* Insert `fatcat` into the `members` array under [workspace] in the root `Cargo.toml`, e.g., `members = [ "fatcat" ]`.
-* Add `fatcat = {version = "0.4.0", path = "fatcat"}` under `[dependencies]` in the root `Cargo.toml`.
+* Add `fatcat = {version = "0.5.0", path = "fatcat"}` under `[dependencies]` in the root `Cargo.toml`.
* Copy the `[dependencies]` and `[dev-dependencies]` from `fatcat/Cargo.toml` into the root `Cargo.toml`'s `[dependencies]` section.
* Copy all of the `[dev-dependencies]`, but only the `[dependencies]` that are required by the example server. These should be clearly indicated by comments.
* Remove `"optional = true"` from each of these lines if present.
diff --git a/rust/fatcat-openapi/src/models.rs b/rust/fatcat-openapi/src/models.rs
index ca203c61..36c9105f 100644
--- a/rust/fatcat-openapi/src/models.rs
+++ b/rust/fatcat-openapi/src/models.rs
@@ -628,6 +628,10 @@ pub struct FileEntity {
#[serde(skip_serializing_if = "Option::is_none")]
pub release_ids: Option<Vec<String>>,
+ #[serde(rename = "content_scope")]
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub content_scope: Option<String>,
+
#[serde(rename = "mimetype")]
#[serde(skip_serializing_if = "Option::is_none")]
pub mimetype: Option<String>,
@@ -692,6 +696,7 @@ impl FileEntity {
FileEntity {
releases: None,
release_ids: None,
+ content_scope: None,
mimetype: None,
urls: None,
sha256: None,
@@ -763,6 +768,10 @@ pub struct FilesetEntity {
#[serde(skip_serializing_if = "Option::is_none")]
pub manifest: Option<Vec<models::FilesetFile>>,
+ #[serde(rename = "content_scope")]
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub content_scope: Option<String>,
+
// Note: inline enums are not fully supported by swagger-codegen
#[serde(rename = "state")]
#[serde(skip_serializing_if = "Option::is_none")]
@@ -801,6 +810,7 @@ impl FilesetEntity {
release_ids: None,
urls: None,
manifest: None,
+ content_scope: None,
state: None,
ident: None,
revision: None,
@@ -1453,6 +1463,10 @@ pub struct WebcaptureEntity {
#[serde(skip_serializing_if = "Option::is_none")]
pub release_ids: Option<Vec<String>>,
+ #[serde(rename = "content_scope")]
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub content_scope: Option<String>,
+
/// Same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Should generally be the timestamp of capture of the primary resource URL.
#[serde(rename = "timestamp")]
#[serde(skip_serializing_if = "Option::is_none")]
@@ -1507,6 +1521,7 @@ impl WebcaptureEntity {
WebcaptureEntity {
releases: None,
release_ids: None,
+ content_scope: None,
timestamp: None,
original_url: None,
archive_urls: None,
diff --git a/rust/migrations/2021-11-17-222046_content_scope/down.sql b/rust/migrations/2021-11-17-222046_content_scope/down.sql
new file mode 100644
index 00000000..b2d55321
--- /dev/null
+++ b/rust/migrations/2021-11-17-222046_content_scope/down.sql
@@ -0,0 +1,10 @@
+-- Undoes `up.sql`: drops the `content_scope` column from file_rev, fileset_rev, and webcapture_rev
+
+ALTER TABLE file_rev
+DROP COLUMN content_scope; -- also discards any values stored in the column (including the test-rev values set by up.sql)
+
+ALTER TABLE fileset_rev
+DROP COLUMN content_scope;
+
+ALTER TABLE webcapture_rev
+DROP COLUMN content_scope;
diff --git a/rust/migrations/2021-11-17-222046_content_scope/up.sql b/rust/migrations/2021-11-17-222046_content_scope/up.sql
new file mode 100644
index 00000000..82c5f2e6
--- /dev/null
+++ b/rust/migrations/2021-11-17-222046_content_scope/up.sql
@@ -0,0 +1,27 @@
+-- This is the v0.5.0 schema
+-- Add a `content_scope` TEXT column to the file_rev, fileset_rev, and webcapture_rev tables
+
+ALTER TABLE file_rev
+ADD COLUMN content_scope TEXT CHECK (octet_length(content_scope) >= 1); -- no NOT NULL, so NULL is accepted; the CHECK rejects the empty string
+
+ALTER TABLE fileset_rev
+ADD COLUMN content_scope TEXT CHECK (octet_length(content_scope) >= 1); -- same constraint as file_rev
+
+ALTER TABLE webcapture_rev
+ADD COLUMN content_scope TEXT CHECK (octet_length(content_scope) >= 1); -- same constraint as file_rev
+
+-------------------- Update Test Revs --------------------------------------
+-- IMPORTANT: don't create new entities here; only mutate existing ones
+
+BEGIN; -- explicit transaction around the three test-rev updates only (the ALTER TABLE statements above sit outside it)
+
+UPDATE file_rev SET content_scope = 'article'
+WHERE id = '00000000-0000-0000-3333-FFF000000003';
+
+UPDATE fileset_rev SET content_scope = 'dataset'
+WHERE id = '00000000-0000-0000-6666-fff000000003'; -- NOTE(review): lower-case hex here vs upper-case in the other two ids; presumably harmless if id is UUID-typed -- confirm
+
+UPDATE webcapture_rev SET content_scope = 'webpage'
+WHERE id = '00000000-0000-0000-7777-FFF000000003';
+
+COMMIT;
diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs
index 76c8675d..0427f9c8 100644
--- a/rust/src/database_models.rs
+++ b/rust/src/database_models.rs
@@ -220,6 +220,7 @@ pub struct FileRevRow {
pub sha256: Option<String>,
pub md5: Option<String>,
pub mimetype: Option<String>,
+ pub content_scope: Option<String>,
}
#[derive(Debug, Associations, AsChangeset, Insertable)]
@@ -231,6 +232,7 @@ pub struct FileRevNewRow {
pub sha256: Option<String>,
pub md5: Option<String>,
pub mimetype: Option<String>,
+ pub content_scope: Option<String>,
}
entity_structs!(
@@ -291,12 +293,14 @@ pub struct FilesetRevUrlNewRow {
pub struct FilesetRevRow {
pub id: Uuid,
pub extra_json: Option<serde_json::Value>,
+ pub content_scope: Option<String>,
}
#[derive(Debug, Associations, AsChangeset, Insertable)]
#[table_name = "fileset_rev"]
pub struct FilesetRevNewRow {
pub extra_json: Option<serde_json::Value>,
+ pub content_scope: Option<String>,
}
entity_structs!(
@@ -360,6 +364,7 @@ pub struct WebcaptureRevRow {
pub extra_json: Option<serde_json::Value>,
pub original_url: String,
pub timestamp: chrono::NaiveDateTime,
+ pub content_scope: Option<String>,
}
#[derive(Debug, Associations, AsChangeset, Insertable)]
@@ -368,6 +373,7 @@ pub struct WebcaptureRevNewRow {
pub extra_json: Option<serde_json::Value>,
pub original_url: String,
pub timestamp: chrono::NaiveDateTime,
+ pub content_scope: Option<String>,
}
entity_structs!(
diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs
index e0a54233..e3d16202 100644
--- a/rust/src/database_schema.rs
+++ b/rust/src/database_schema.rs
@@ -163,6 +163,7 @@ table! {
sha256 -> Nullable<Text>,
md5 -> Nullable<Text>,
mimetype -> Nullable<Text>,
+ content_scope -> Nullable<Text>,
}
}
@@ -208,6 +209,7 @@ table! {
fileset_rev (id) {
id -> Uuid,
extra_json -> Nullable<Jsonb>,
+ content_scope -> Nullable<Text>,
}
}
@@ -372,6 +374,7 @@ table! {
extra_json -> Nullable<Jsonb>,
original_url -> Text,
timestamp -> Timestamptz,
+ content_scope -> Nullable<Text>,
}
}
diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs
index 19cb58ea..f48246a5 100644
--- a/rust/src/entity_crud.rs
+++ b/rust/src/entity_crud.rs
@@ -1042,6 +1042,7 @@ impl EntityCrud for FileEntity {
size: None,
urls: None,
mimetype: None,
+ content_scope: None,
release_ids: None,
releases: None,
state: Some(ident_row.state().unwrap().shortname()),
@@ -1125,6 +1126,7 @@ impl EntityCrud for FileEntity {
size: rev_row.size_bytes,
urls: Some(urls),
mimetype: rev_row.mimetype,
+ content_scope: rev_row.content_scope,
release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()),
releases: None,
state,
@@ -1160,6 +1162,7 @@ impl EntityCrud for FileEntity {
sha256: model.sha256.clone(),
md5: model.md5.clone(),
mimetype: model.mimetype.clone(),
+ content_scope: model.content_scope.clone(),
extra_json: model.extra.clone(),
})
.collect::<Vec<FileRevNewRow>>(),
@@ -1245,6 +1248,7 @@ impl EntityCrud for FilesetEntity {
}
Ok(FilesetEntity {
+ content_scope: None,
manifest: None,
urls: None,
release_ids: None,
@@ -1340,6 +1344,7 @@ impl EntityCrud for FilesetEntity {
.collect();
Ok(FilesetEntity {
+ content_scope: rev_row.content_scope.clone(),
manifest: Some(manifest),
urls: Some(urls),
release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()),
@@ -1376,6 +1381,7 @@ impl EntityCrud for FilesetEntity {
models
.iter()
.map(|model| FilesetRevNewRow {
+ content_scope: model.content_scope.clone(),
extra_json: model.extra.clone(),
})
.collect::<Vec<FilesetRevNewRow>>(),
@@ -1492,6 +1498,7 @@ impl EntityCrud for WebcaptureEntity {
archive_urls: None,
original_url: None,
timestamp: None,
+ content_scope: None,
release_ids: None,
releases: None,
state: Some(ident_row.state().unwrap().shortname()),
@@ -1590,6 +1597,7 @@ impl EntityCrud for WebcaptureEntity {
archive_urls: Some(archive_urls),
original_url: Some(rev_row.original_url),
timestamp: Some(chrono::DateTime::from_utc(rev_row.timestamp, chrono::Utc)),
+ content_scope: rev_row.content_scope,
release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()),
releases: None,
state,
@@ -1628,6 +1636,7 @@ impl EntityCrud for WebcaptureEntity {
// these unwraps safe because of check above
original_url: model.original_url.clone().unwrap(),
timestamp: model.timestamp.unwrap().naive_utc(),
+ content_scope: model.content_scope.clone(),
extra_json: model.extra.clone(),
})
.collect::<Vec<WebcaptureRevNewRow>>(),
diff --git a/rust/src/server.rs b/rust/src/server.rs
index 4d30ecbe..fb55f03c 100644
--- a/rust/src/server.rs
+++ b/rust/src/server.rs
@@ -72,6 +72,7 @@ pub fn create_test_server() -> Result<Server> {
diesel_migrations::revert_latest_migration(&conn).unwrap();
diesel_migrations::revert_latest_migration(&conn).unwrap();
diesel_migrations::revert_latest_migration(&conn).unwrap();
+ diesel_migrations::revert_latest_migration(&conn).unwrap();
diesel_migrations::run_pending_migrations(&conn).unwrap();
Ok(server)
}
diff --git a/rust/tests/test_api_server_http.rs b/rust/tests/test_api_server_http.rs
index 8f691e0d..0601a26b 100644
--- a/rust/tests/test_api_server_http.rs
+++ b/rust/tests/test_api_server_http.rs
@@ -636,6 +636,7 @@ fn test_post_file() {
{"url": "http://web.archive.org/2/http://archive.org/asdf.txt", "rel": "webarchive" }
],
"mimetype": "application/pdf",
+ "content_scope": "article",
"release_ids": [
"aaaaaaaaaaaaarceaaaaaaaaae",
"aaaaaaaaaaaaarceaaaaaaaaai"
@@ -711,6 +712,7 @@ fn test_post_fileset() {
"aaaaaaaaaaaaarceaaaaaaaaae",
"aaaaaaaaaaaaarceaaaaaaaaai"
],
+ "content_scope": "dataset",
"extra": { "source": "speculation" }
}"#,
&router,
@@ -764,6 +766,7 @@ fn test_post_webcapture() {
headers.clone(),
r#"{"original_url": "https://bnewbold.net/",
"timestamp": "2018-12-28T05:06:07Z",
+ "content_scope": "landing-page",
"cdx": [
{"surt": "org,asheesh,)/robots.txt",
"timestamp": "2018-12-28T05:06:07Z",