diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-22 16:12:01 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-22 16:12:01 -0800 |
commit | 5c7f50b2f497692493bfa54ad4741fdc573352ae (patch) | |
tree | c20cce1884076fffe210ba28e1a569f93ed22827 | |
parent | f3bd82c0308948a63645538bdd9511a503625499 (diff) | |
parent | dd00cec4164c1a1c31c8d9cffb92deb2e30b2211 (diff) | |
download | fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.tar.gz fatcat-5c7f50b2f497692493bfa54ad4741fdc573352ae.zip |
Merge branch 'bnewbold-content-scope'
33 files changed, 346 insertions, 28 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index af5d96c4..4f080a17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,11 @@ See also: - [Semantic Versioning](https://semver.org/spec/v2.0.0.html) -## [UNRELEASED] +## [0.5.0] - UNRELEASED + +Small change to the API schema (and SQL schema), adding the `content_scope` +field to file, fileset, and webcapture entities. Because there is a SQL schema +change, bumping to version v0.5.0. An outward-facing change is that the fatcat API server now attempts to "stabilize" array order within JSON responses by sorting elements by @@ -28,6 +32,9 @@ was created. In particular, this may cause broad discrepancies compared to historical bulk metadata exports. New bulk exports will be generated with the new ordering. +A number of content cleanups and changes are also taking place to the primary +catalog (fatcat.wiki), see the separate content CHANGELOG for details. + ### Fixed - API array order stabilization, using `ORDER BY` in `fatcatd`. See note above. @@ -35,12 +42,22 @@ metadata exports. New bulk exports will be generated with the new ordering. 
### Changed - broad python code style updates: formatting, lint rules, and type annotations +- a number of internal refactors of metadata importers +- stopped creating a small number of Datacite-specific license slugs +- stopped trying to "fix" double slashes in DOIs, in most cases +- reduced amount of metadata stored in release `extra` field in Datacite + importer ### Added +- `content_scope` field on file, fileset, and webcapture entities - initial fileset importers - JSON pseudo-API for reference string match/get interface +### Removed + +- deleted deprecated `cdl_dash_dat` and `wayback_static` one-time importers + ## [0.4.0] - 2021-10-14 Includes small API and SQL schema changes; see diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index a8dbc6d0..34e2b0b3 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -37,6 +37,7 @@ "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, "release_count": { "type": "integer" }, "mimetype": { "type": "keyword", "normalizer": "default" }, + "content_scope": { "type": "keyword", "normalizer": "default" }, "size_bytes": { "type": "integer" }, "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, diff --git a/fatcat-openapi2.yml b/fatcat-openapi2.yml index 67915dda..7fafdb89 100644 --- a/fatcat-openapi2.yml +++ b/fatcat-openapi2.yml @@ -2,7 +2,7 @@ swagger: "2.0" info: title: fatcat - version: 0.4.0 + version: 0.5.0 description: | Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. 
@@ -460,6 +460,9 @@ definitions: mimetype: type: string example: "application/pdf" + content_scope: + type: string + example: "issue" release_ids: type: array items: @@ -500,6 +503,9 @@ definitions: type: object properties: <<: *ENTITYPROPS + content_scope: + type: string + example: "issue" manifest: # limit of 200 files, at least to start type: array @@ -601,6 +607,9 @@ definitions: Same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Should generally be the timestamp of capture of the primary resource URL. + content_scope: + type: string + example: "landing-page" release_ids: type: array items: diff --git a/guide/src/entity_file.md b/guide/src/entity_file.md index 7429c982..84d9eac4 100644 --- a/guide/src/entity_file.md +++ b/guide/src/entity_file.md @@ -13,9 +13,13 @@ - `urls`: An array of "typed" URLs. Order is not meaningful, and may not be preserved. - `url` (string, required): Eg: "https://example.edu/~frau/prcding.pdf". - - `rel` (string, required): Eg: "webarchive". + - `rel` (string, required): Eg: "webarchive", see vocabulary below. - `mimetype` (string): Format of the file. If XML, specific schema can be included after a `+`. Example: "application/pdf" +- `content_scope` (string): for situations where the file does not simply + contain the full representation of a work (eg, fulltext of an article, for an + `article-journal` release), describes what that scope of coverage is. Eg, + entire `issue`, `corrupt` file. See vocabulary below. - `release_ids` (array of string identifiers): references to `release` entities that this file represents a manifestation of. 
Note that a single file can contain multiple release references (eg, a PDF containing a full issue with @@ -35,3 +39,37 @@ Scholar - `dweb`: content hosted on distributed/decentralized web protocols, such as `dat://` or `ipfs://` URLs + +#### `content_scope` Vocabulary + +This same vocabulary is shared between file, fileset, and webcapture entities; +not all the fields make sense for each entity type. + +- if not set, assume that the artifact entity is valid and represents a + complete copy of the release +- `issue`: artifact contains an entire issue of a serial publication (eg, issue + of a journal), representing several releases in full +- `abstract`: contains only an abstract (short description) of the release, not + the release itself (unless the `release_type` itself is `abstract`, in which + case it is the entire release) +- `index`: index of a journal, or series of abstracts from a conference +- `slides`: slide deck (usually in "landscape" orientation) +- `front-matter`: non-article content from a journal, such as editorial policies +- `supplement`: usually a file entity which is a supplement or appendix, not + the entire work +- `component`: a sub-component of a release, which may or may not be associated + with a `component` release entity. For example, a single figure or table as + part of an article +- `poster`: digital copy of a poster, eg as displayed at conference poster sessions +- `sample`: a partial sample of the entire work. eg, just the first page of an + article. distinct from `truncated` +- `truncated`: the file has been truncated at a binary level, and may also be + corrupt or invalid. 
distinct from `sample` +- `corrupt`: broken, mangled, or corrupt file (at the binary level) +- `stub`: any other out-of-scope artifact situations, where the artifact + represents something which would not link to any possible in-scope release in + the catalog (except a `stub` release) +- `landing-page`: for webcapture, the landing page of a work, as opposed to the + work itself +- `spam`: content is spam. articles, webpages, or issues which include + incidental advertisements within them are not counted as `spam` diff --git a/guide/src/entity_fileset.md b/guide/src/entity_fileset.md index e1ac3e67..6083a09d 100644 --- a/guide/src/entity_fileset.md +++ b/guide/src/entity_fileset.md @@ -21,6 +21,10 @@ - `rel` (string, required): Eg: "webarchive". - `release_ids` (array of string identifiers): references to `release` entities +- `content_scope` (string): for situations where the fileset does not simply + contain the full representation of a work (eg, all files in dataset, for a + `dataset` release), describes what that scope of coverage is. Uses same + vocabulary as File entity. - `extra` (object with string keys): additional metadata about this group of files, including upstream platform-specific metadata and identifiers diff --git a/guide/src/entity_webcapture.md b/guide/src/entity_webcapture.md index 8c5615fb..1b3cac55 100644 --- a/guide/src/entity_webcapture.md +++ b/guide/src/entity_webcapture.md @@ -29,4 +29,10 @@ Warning: This schema is not yet stable. - `timestamp` (string, datetime): same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Can be the earliest of CDX timestamps if that makes sense +- `content_scope` (string): for situations where the webcapture does not simply + contain the full representation of a work (eg, HTML fulltext, for an + `article-journal` release), describes what that scope of coverage is. Eg, + `landing-page` if it doesn't contain the full content. 
Landing pages are + out-of-scope for fatcat, but if they were accidentally imported, should mark + them as such so they aren't re-imported. Uses same vocabulary as File entity. - `release_ids` (array of string identifiers): references to `release` entities diff --git a/proposals/2021-11-17_content_scope.md b/proposals/2021-11-17_content_scope.md new file mode 100644 index 00000000..8d04808e --- /dev/null +++ b/proposals/2021-11-17_content_scope.md @@ -0,0 +1,84 @@ + +status: planned + +Content Scope Fields +====================== + +Usually, "artifact" entities (file, fileset, webcapture) should not contain +bibliographic metadata about their contents. For example, a file entity +describing a PDF of a journal article should not indicate the publication +stage, retraction status, publication type, journal ISSN, or other metadata +about that article; the `release` entity should contain that information. +Additionally, it is usually assumed that a single "artifact" entity is a +complete representation of any associated release entities: the complete +dataset, or complete article. + +This document describes a new metadata field to handle some special cases that +go against this principle: the `content_scope` of a file, fileset, or +webcapture. It is intended to be used when there is an exception to the +assumption that a single "artifact" is a complete representation of a release. +It is particularly useful when there is a problem with the artifact, resulting +in it being disassociated from all releases. + + +## Values + +This section will get copied to the guide. 
+ +- if not set, assume that the artifact entity is valid and represents a + complete copy of the release +- `issue`: artifact contains an entire issue of a serial publication (eg, issue + of a journal), representing several releases in full +- `abstract`: contains only an abstract (short description) of the release, not + the release itself (unless the `release_type` itself is `abstract`, in which + case it is the entire release) +- `index`: index of a journal, or series of abstracts from a conference (TODO: + separate value for conference abstract lists?) +- `slides`: slide deck (usually in "landscape" orientation) +- `front-matter`: non-article content from a journal, such as editorial policies +- `supplement`: usually a file entity which is a supplement or appendix, not + the entire work +- `component`: a sub-component of a release, which may or may not be associated + with a `component` release entity. For example, a single figure or table as + part of an article +- `poster`: digital copy of a poster, eg as displayed at conference poster sessions +- `sample`: a partial sample of the entire work. eg, just the first page of an + article. distinct from `truncated` +- `truncated`: the file has been truncated at a binary level, and may also be + corrupt or invalid. distinct from `sample` +- `corrupt`: broken, mangled, or corrupt file (at the binary level) +- `stub`: any other out-of-scope artifact situations, where the artifact + represents something which would not link to any possible in-scope release in + the catalog (except a `stub` release) +- `landing-page`: for webcapture, the landing page of a work, as opposed to the + work itself +- `spam`: content is spam. articles, webpages, or issues which include + incidental advertisements within them are not counted as `spam` + + +## Implementation + +The string field `content_scope` will be added to file, fileset, and webcapture +entities. + +By default, this field does not need to be set. 
If it is empty, it can be +assumed that the artifact represents an appropriate copy of the full release. +If it is set, and the artifact is associated with one or more releases, +downstream users/code may want to verify that the `content_scope` and +`release_type` values are consistent. For example, `slides` is not consistent +with `article-journal`, so such a file should be marked for review, and not +considered a valid access option or preservation copy for the purposes of +coverage analysis. + + +## Removing Release Linkage + +In cases where the "artifact" entity is not an acceptable representation of any +release (eg, truncation, corruption, spam), the entity should have the +`release_ids` field cleared. + +Optionally, the new `extra` field `related_release_ids` can be used to indicate +that an artifact entity has something to do with specific releases, but is not +a full representation of them. This can be useful for corrupt or partial +content to link to releases it is a partial representation of. 
+ diff --git a/python/Pipfile.lock b/python/Pipfile.lock index a5c26410..2c7d08b0 100644 --- a/python/Pipfile.lock +++ b/python/Pipfile.lock @@ -262,7 +262,7 @@ }, "fatcat-openapi-client": { "path": "./../python_openapi_client", - "version": "==0.4.0" + "version": "==0.5.0" }, "filelock": { "hashes": [ diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index d4962205..c16053ec 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -650,6 +650,7 @@ def file_to_elasticsearch(entity: FileEntity) -> Dict[str, Any]: release_ids=entity.release_ids, release_count=len(entity.release_ids), mimetype=entity.mimetype, + content_scope=entity.content_scope, size_bytes=entity.size, sha1=entity.sha1, sha256=entity.sha256, diff --git a/python/tests/api_files.py b/python/tests/api_files.py index 8f9caf3e..cd08eef7 100644 --- a/python/tests/api_files.py +++ b/python/tests/api_files.py @@ -13,6 +13,7 @@ def test_file(api): sha1="027e7ed3ea1a40e92dd2657a1e3c992b5dc45dd2", sha256="f1f4f18a904e76818863ccbc6141fce92b0dcb47b0d6041aec98bc6806e393c3", mimetype="application/pdf", + content_scope="article", urls=[ FileUrl( url="https://web.archive.org/web/12345542/something.com/blah.pdf", @@ -39,6 +40,7 @@ def test_file(api): assert f1.sha1 == f2.sha1 assert f1.sha256 == f2.sha256 assert f1.mimetype == f2.mimetype + assert f1.content_scope == f2.content_scope assert f1.extra == f2.extra assert f1.urls == f2.urls assert f1.release_ids == f2.release_ids diff --git a/python/tests/api_filesets.py b/python/tests/api_filesets.py index d7654eb9..8ab658f5 100644 --- a/python/tests/api_filesets.py +++ b/python/tests/api_filesets.py @@ -34,6 +34,7 @@ def test_fileset(api): FilesetUrl(url="https://humble-host.com/~user123/dataset/", rel="web"), ], release_ids=[r1edit.ident], + content_scope="dataset", extra=dict(t=4, u=9), edit_extra=dict(test_key="filesets rule"), ) @@ -52,6 +53,7 
@@ def test_fileset(api): assert fs1.urls == fs2.urls assert fs1.manifest == fs2.manifest assert fs1.release_ids == fs2.release_ids + assert fs1.content_scope == fs2.content_scope assert fs1.extra == fs2.extra # expansion diff --git a/python/tests/api_webcaptures.py b/python/tests/api_webcaptures.py index 76bc68c0..6a477ff2 100644 --- a/python/tests/api_webcaptures.py +++ b/python/tests/api_webcaptures.py @@ -44,6 +44,7 @@ def test_webcapture(api): FileUrl(rel="wayback", url="https://web.archive.org/web/"), ], release_ids=[r1edit.ident], + content_scope="landing-page", extra=dict(c=1, b=2), edit_extra=dict(test_key="webcaptures rule"), ) @@ -69,6 +70,7 @@ def test_webcapture(api): assert wc1.release_ids == wc2.release_ids assert wc1.timestamp == wc2.timestamp assert wc1.original_url == wc2.original_url + assert wc1.content_scope == wc2.content_scope assert wc1.extra == wc2.extra # check release expansion diff --git a/python_openapi_client/README.md b/python_openapi_client/README.md index 8cc34147..316b9dfd 100644 --- a/python_openapi_client/README.md +++ b/python_openapi_client/README.md @@ -3,8 +3,8 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities This Python package is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project: -- API version: 0.4.0 -- Package version: 0.4.0 +- API version: 0.5.0 +- Package version: 0.5.0 - Build package: org.openapitools.codegen.languages.PythonClientCodegen For more information, please visit [https://fatcat.wiki](https://fatcat.wiki) diff --git a/python_openapi_client/codegen_python_client.sh b/python_openapi_client/codegen_python_client.sh index bbf2ad83..17782c68 100755 --- a/python_openapi_client/codegen_python_client.sh +++ b/python_openapi_client/codegen_python_client.sh @@ -20,7 +20,7 @@ docker run \ --input-spec /tmp/swagger/api.yml \ --output /tmp/swagger/ \ --package-name=fatcat_openapi_client \ - -p packageVersion="0.4.0" + -p packageVersion="0.5.0" 
sudo chown -R `whoami`:`whoami` $OUTPUT mkdir -p fatcat_openapi_client diff --git a/python_openapi_client/fatcat_openapi_client/__init__.py b/python_openapi_client/fatcat_openapi_client/__init__.py index 5f1a7fba..8749264e 100644 --- a/python_openapi_client/fatcat_openapi_client/__init__.py +++ b/python_openapi_client/fatcat_openapi_client/__init__.py @@ -7,7 +7,7 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501 - The version of the OpenAPI document: 0.4.0 + The version of the OpenAPI document: 0.5.0 Contact: webservices@archive.org Generated by: https://openapi-generator.tech """ @@ -15,7 +15,7 @@ from __future__ import absolute_import -__version__ = "0.4.0" +__version__ = "0.5.0" # import apis into sdk package from fatcat_openapi_client.api.default_api import DefaultApi diff --git a/python_openapi_client/fatcat_openapi_client/__version__.py b/python_openapi_client/fatcat_openapi_client/__version__.py index 618922fe..b0634514 100644 --- a/python_openapi_client/fatcat_openapi_client/__version__.py +++ b/python_openapi_client/fatcat_openapi_client/__version__.py @@ -1,3 +1,3 @@ -VERSION = (0, 4, 0) # eg, (0, 2, '0dev0') +VERSION = (0, 5, 0) # eg, (0, 2, '0dev0') __version__ = '.'.join(map(str, VERSION)) diff --git a/python_openapi_client/fatcat_openapi_client/api_client.py b/python_openapi_client/fatcat_openapi_client/api_client.py index efef8cbf..eb23f1cd 100644 --- a/python_openapi_client/fatcat_openapi_client/api_client.py +++ b/python_openapi_client/fatcat_openapi_client/api_client.py @@ -4,7 +4,7 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. 
# noqa: E501 - The version of the OpenAPI document: 0.4.0 + The version of the OpenAPI document: 0.5.0 Contact: webservices@archive.org Generated by: https://openapi-generator.tech """ @@ -77,7 +77,7 @@ class ApiClient(object): self.default_headers[header_name] = header_value self.cookie = cookie # Set default User-Agent. - self.user_agent = 'OpenAPI-Generator/0.4.0/python' + self.user_agent = 'OpenAPI-Generator/0.5.0/python' def __del__(self): if self._pool: diff --git a/python_openapi_client/fatcat_openapi_client/configuration.py b/python_openapi_client/fatcat_openapi_client/configuration.py index c0e39620..dacf77c9 100644 --- a/python_openapi_client/fatcat_openapi_client/configuration.py +++ b/python_openapi_client/fatcat_openapi_client/configuration.py @@ -5,7 +5,7 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501 - The version of the OpenAPI document: 0.4.0 + The version of the OpenAPI document: 0.5.0 Contact: webservices@archive.org Generated by: https://openapi-generator.tech """ @@ -267,8 +267,8 @@ class Configuration(object): return "Python SDK Debug Report:\n"\ "OS: {env}\n"\ "Python Version: {pyversion}\n"\ - "Version of the API: 0.4.0\n"\ - "SDK Package Version: 0.4.0".\ + "Version of the API: 0.5.0\n"\ + "SDK Package Version: 0.5.0".\ format(env=sys.platform, pyversion=sys.version) def get_host_settings(self): diff --git a/python_openapi_client/fatcat_openapi_client/models/file_entity.py b/python_openapi_client/fatcat_openapi_client/models/file_entity.py index e52635a6..d97a0a03 100644 --- a/python_openapi_client/fatcat_openapi_client/models/file_entity.py +++ b/python_openapi_client/fatcat_openapi_client/models/file_entity.py @@ -5,7 +5,7 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. 
# noqa: E501 - The version of the OpenAPI document: 0.3.1 + The version of the OpenAPI document: 0.5.0 Contact: webservices@archive.org Generated by: https://openapi-generator.tech """ @@ -44,6 +44,7 @@ class FileEntity(object): 'sha256': 'str', 'urls': 'list[FileUrl]', 'mimetype': 'str', + 'content_scope': 'str', 'release_ids': 'list[str]', 'releases': 'list[ReleaseEntity]' } @@ -61,11 +62,12 @@ class FileEntity(object): 'sha256': 'sha256', 'urls': 'urls', 'mimetype': 'mimetype', + 'content_scope': 'content_scope', 'release_ids': 'release_ids', 'releases': 'releases' } - def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, size=None, md5=None, sha1=None, sha256=None, urls=None, mimetype=None, release_ids=None, releases=None): # noqa: E501 + def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, size=None, md5=None, sha1=None, sha256=None, urls=None, mimetype=None, content_scope=None, release_ids=None, releases=None): # noqa: E501 """FileEntity - a model defined in OpenAPI""" # noqa: E501 self._state = None @@ -80,6 +82,7 @@ class FileEntity(object): self._sha256 = None self._urls = None self._mimetype = None + self._content_scope = None self._release_ids = None self._releases = None self.discriminator = None @@ -108,6 +111,8 @@ class FileEntity(object): self.urls = urls if mimetype is not None: self.mimetype = mimetype + if content_scope is not None: + self.content_scope = content_scope if release_ids is not None: self.release_ids = release_ids if releases is not None: @@ -426,6 +431,27 @@ class FileEntity(object): self._mimetype = mimetype @property + def content_scope(self): + """Gets the content_scope of this FileEntity. # noqa: E501 + + + :return: The content_scope of this FileEntity. # noqa: E501 + :rtype: str + """ + return self._content_scope + + @content_scope.setter + def content_scope(self, content_scope): + """Sets the content_scope of this FileEntity. 
+ + + :param content_scope: The content_scope of this FileEntity. # noqa: E501 + :type: str + """ + + self._content_scope = content_scope + + @property def release_ids(self): """Gets the release_ids of this FileEntity. # noqa: E501 diff --git a/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py b/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py index 51952d2a..dfc0787a 100644 --- a/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py +++ b/python_openapi_client/fatcat_openapi_client/models/fileset_entity.py @@ -5,7 +5,7 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. # noqa: E501 - The version of the OpenAPI document: 0.3.1 + The version of the OpenAPI document: 0.5.0 Contact: webservices@archive.org Generated by: https://openapi-generator.tech """ @@ -38,6 +38,7 @@ class FilesetEntity(object): 'redirect': 'str', 'extra': 'dict(str, object)', 'edit_extra': 'dict(str, object)', + 'content_scope': 'str', 'manifest': 'list[FilesetFile]', 'urls': 'list[FilesetUrl]', 'release_ids': 'list[str]', @@ -51,13 +52,14 @@ class FilesetEntity(object): 'redirect': 'redirect', 'extra': 'extra', 'edit_extra': 'edit_extra', + 'content_scope': 'content_scope', 'manifest': 'manifest', 'urls': 'urls', 'release_ids': 'release_ids', 'releases': 'releases' } - def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, manifest=None, urls=None, release_ids=None, releases=None): # noqa: E501 + def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, content_scope=None, manifest=None, urls=None, release_ids=None, releases=None): # noqa: E501 """FilesetEntity - a model defined in OpenAPI""" # noqa: E501 self._state = None @@ -66,6 +68,7 @@ class FilesetEntity(object): self._redirect = None self._extra = None self._edit_extra = None + self._content_scope = None self._manifest = None self._urls = None 
self._release_ids = None @@ -84,6 +87,8 @@ class FilesetEntity(object): self.extra = extra if edit_extra is not None: self.edit_extra = edit_extra + if content_scope is not None: + self.content_scope = content_scope if manifest is not None: self.manifest = manifest if urls is not None: @@ -254,6 +259,27 @@ class FilesetEntity(object): self._edit_extra = edit_extra @property + def content_scope(self): + """Gets the content_scope of this FilesetEntity. # noqa: E501 + + + :return: The content_scope of this FilesetEntity. # noqa: E501 + :rtype: str + """ + return self._content_scope + + @content_scope.setter + def content_scope(self, content_scope): + """Sets the content_scope of this FilesetEntity. + + + :param content_scope: The content_scope of this FilesetEntity. # noqa: E501 + :type: str + """ + + self._content_scope = content_scope + + @property def manifest(self): """Gets the manifest of this FilesetEntity. # noqa: E501 diff --git a/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py b/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py index 82363c23..968b0b1c 100644 --- a/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py +++ b/python_openapi_client/fatcat_openapi_client/models/webcapture_entity.py @@ -5,7 +5,7 @@ Fatcat is a scalable, versioned, API-oriented catalog of bibliographic entities and file metadata. 
# noqa: E501 - The version of the OpenAPI document: 0.3.1 + The version of the OpenAPI document: 0.5.0 Contact: webservices@archive.org Generated by: https://openapi-generator.tech """ @@ -42,6 +42,7 @@ class WebcaptureEntity(object): 'archive_urls': 'list[WebcaptureUrl]', 'original_url': 'str', 'timestamp': 'datetime', + 'content_scope': 'str', 'release_ids': 'list[str]', 'releases': 'list[ReleaseEntity]' } @@ -57,11 +58,12 @@ class WebcaptureEntity(object): 'archive_urls': 'archive_urls', 'original_url': 'original_url', 'timestamp': 'timestamp', + 'content_scope': 'content_scope', 'release_ids': 'release_ids', 'releases': 'releases' } - def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, cdx=None, archive_urls=None, original_url=None, timestamp=None, release_ids=None, releases=None): # noqa: E501 + def __init__(self, state=None, ident=None, revision=None, redirect=None, extra=None, edit_extra=None, cdx=None, archive_urls=None, original_url=None, timestamp=None, content_scope=None, release_ids=None, releases=None): # noqa: E501 """WebcaptureEntity - a model defined in OpenAPI""" # noqa: E501 self._state = None @@ -74,6 +76,7 @@ class WebcaptureEntity(object): self._archive_urls = None self._original_url = None self._timestamp = None + self._content_scope = None self._release_ids = None self._releases = None self.discriminator = None @@ -98,6 +101,8 @@ class WebcaptureEntity(object): self.original_url = original_url if timestamp is not None: self.timestamp = timestamp + if content_scope is not None: + self.content_scope = content_scope if release_ids is not None: self.release_ids = release_ids if releases is not None: @@ -352,6 +357,27 @@ class WebcaptureEntity(object): self._timestamp = timestamp @property + def content_scope(self): + """Gets the content_scope of this WebcaptureEntity. # noqa: E501 + + + :return: The content_scope of this WebcaptureEntity. 
# noqa: E501 + :rtype: str + """ + return self._content_scope + + @content_scope.setter + def content_scope(self, content_scope): + """Sets the content_scope of this WebcaptureEntity. + + + :param content_scope: The content_scope of this WebcaptureEntity. # noqa: E501 + :type: str + """ + + self._content_scope = content_scope + + @property def release_ids(self): """Gets the release_ids of this WebcaptureEntity. # noqa: E501 diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 716f2b6b..6fa62c5f 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -607,7 +607,7 @@ dependencies = [ [[package]] name = "fatcat" -version = "0.4.0" +version = "0.5.0" dependencies = [ "cadence", "chrono 0.4.6", @@ -648,7 +648,7 @@ dependencies = [ [[package]] name = "fatcat-openapi" -version = "0.4.0" +version = "0.5.0" dependencies = [ "bodyparser", "chrono 0.4.6", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index e2ce5e41..b521316e 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fatcat" -version = "0.4.0" +version = "0.5.0" edition = "2018" authors = ["Bryan Newbold <bnewbold@archive.org>"] description = "A scalable, versioned, API-oriented catalog for bibliographic entities and file metadata" diff --git a/rust/fatcat-openapi/Cargo.toml b/rust/fatcat-openapi/Cargo.toml index 7f417242..f532b780 100644 --- a/rust/fatcat-openapi/Cargo.toml +++ b/rust/fatcat-openapi/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fatcat-openapi" -version = "0.4.0" +version = "0.5.0" edition = "2018" authors = ["Bryan Newbold <bnewbold@archive.org>"] description = "Fatcat is an editable bibliographic database. This OpenAPI code-generated crate container HTTP API models, endpoints, and other auto-generated types useful for both client and server implementations of the catalog API." 
diff --git a/rust/fatcat-openapi/README.md b/rust/fatcat-openapi/README.md index 327fe3a8..0ed53d3d 100644 --- a/rust/fatcat-openapi/README.md +++ b/rust/fatcat-openapi/README.md @@ -12,8 +12,8 @@ To see how to make this your own, look here: [README](https://github.com/swagger-api/swagger-codegen/blob/master/README.md) -- API version: 0.4.0 -- Build date: 2021-10-12T23:51:46.767Z +- API version: 0.5.0 +- Build date: 2021-11-17T22:18:19.232Z For more information, please visit [https://fatcat.wiki](https://fatcat.wiki) This autogenerated project defines an API crate `fatcat` which contains: @@ -172,7 +172,7 @@ The server example is designed to form the basis for implementing your own serve * Set up a new Rust project, e.g., with `cargo init --bin`. * Insert `fatcat` into the `members` array under [workspace] in the root `Cargo.toml`, e.g., `members = [ "fatcat" ]`. -* Add `fatcat = {version = "0.4.0", path = "fatcat"}` under `[dependencies]` in the root `Cargo.toml`. +* Add `fatcat = {version = "0.5.0", path = "fatcat"}` under `[dependencies]` in the root `Cargo.toml`. * Copy the `[dependencies]` and `[dev-dependencies]` from `fatcat/Cargo.toml` into the root `Cargo.toml`'s `[dependencies]` section. * Copy all of the `[dev-dependencies]`, but only the `[dependencies]` that are required by the example server. These should be clearly indicated by comments. * Remove `"optional = true"` from each of these lines if present. 
diff --git a/rust/fatcat-openapi/src/models.rs b/rust/fatcat-openapi/src/models.rs index ca203c61..36c9105f 100644 --- a/rust/fatcat-openapi/src/models.rs +++ b/rust/fatcat-openapi/src/models.rs @@ -628,6 +628,10 @@ pub struct FileEntity { #[serde(skip_serializing_if = "Option::is_none")] pub release_ids: Option<Vec<String>>, + #[serde(rename = "content_scope")] + #[serde(skip_serializing_if = "Option::is_none")] + pub content_scope: Option<String>, + #[serde(rename = "mimetype")] #[serde(skip_serializing_if = "Option::is_none")] pub mimetype: Option<String>, @@ -692,6 +696,7 @@ impl FileEntity { FileEntity { releases: None, release_ids: None, + content_scope: None, mimetype: None, urls: None, sha256: None, @@ -763,6 +768,10 @@ pub struct FilesetEntity { #[serde(skip_serializing_if = "Option::is_none")] pub manifest: Option<Vec<models::FilesetFile>>, + #[serde(rename = "content_scope")] + #[serde(skip_serializing_if = "Option::is_none")] + pub content_scope: Option<String>, + // Note: inline enums are not fully supported by swagger-codegen #[serde(rename = "state")] #[serde(skip_serializing_if = "Option::is_none")] @@ -801,6 +810,7 @@ impl FilesetEntity { release_ids: None, urls: None, manifest: None, + content_scope: None, state: None, ident: None, revision: None, @@ -1453,6 +1463,10 @@ pub struct WebcaptureEntity { #[serde(skip_serializing_if = "Option::is_none")] pub release_ids: Option<Vec<String>>, + #[serde(rename = "content_scope")] + #[serde(skip_serializing_if = "Option::is_none")] + pub content_scope: Option<String>, + /// Same format as CDX line timestamp (UTC, etc). Corresponds to the overall capture timestamp. Should generally be the timestamp of capture of the primary resource URL. 
#[serde(rename = "timestamp")] #[serde(skip_serializing_if = "Option::is_none")] @@ -1507,6 +1521,7 @@ impl WebcaptureEntity { WebcaptureEntity { releases: None, release_ids: None, + content_scope: None, timestamp: None, original_url: None, archive_urls: None, diff --git a/rust/migrations/2021-11-17-222046_content_scope/down.sql b/rust/migrations/2021-11-17-222046_content_scope/down.sql new file mode 100644 index 00000000..b2d55321 --- /dev/null +++ b/rust/migrations/2021-11-17-222046_content_scope/down.sql @@ -0,0 +1,10 @@ +-- This file should undo anything in `up.sql` + +ALTER TABLE file_rev +DROP COLUMN content_scope; + +ALTER TABLE fileset_rev +DROP COLUMN content_scope; + +ALTER TABLE webcapture_rev +DROP COLUMN content_scope; diff --git a/rust/migrations/2021-11-17-222046_content_scope/up.sql b/rust/migrations/2021-11-17-222046_content_scope/up.sql new file mode 100644 index 00000000..82c5f2e6 --- /dev/null +++ b/rust/migrations/2021-11-17-222046_content_scope/up.sql @@ -0,0 +1,27 @@ +-- This is the v0.5.0 schema +-- Add `content_scope` field to file, fileset, webcapture + +ALTER TABLE file_rev +ADD COLUMN content_scope TEXT CHECK (octet_length(content_scope) >= 1); + +ALTER TABLE fileset_rev +ADD COLUMN content_scope TEXT CHECK (octet_length(content_scope) >= 1); + +ALTER TABLE webcapture_rev +ADD COLUMN content_scope TEXT CHECK (octet_length(content_scope) >= 1); + +-------------------- Update Test Revs -------------------------------------- +-- IMPORTANT: don't create new entities here, only mutate existing + +BEGIN; + +UPDATE file_rev SET content_scope = 'article' +WHERE id = '00000000-0000-0000-3333-FFF000000003'; + +UPDATE fileset_rev SET content_scope = 'dataset' +WHERE id = '00000000-0000-0000-6666-fff000000003'; + +UPDATE webcapture_rev SET content_scope = 'webpage' +WHERE id = '00000000-0000-0000-7777-FFF000000003'; + +COMMIT; diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs index 76c8675d..0427f9c8 100644 --- 
a/rust/src/database_models.rs +++ b/rust/src/database_models.rs @@ -220,6 +220,7 @@ pub struct FileRevRow { pub sha256: Option<String>, pub md5: Option<String>, pub mimetype: Option<String>, + pub content_scope: Option<String>, } #[derive(Debug, Associations, AsChangeset, Insertable)] @@ -231,6 +232,7 @@ pub struct FileRevNewRow { pub sha256: Option<String>, pub md5: Option<String>, pub mimetype: Option<String>, + pub content_scope: Option<String>, } entity_structs!( @@ -291,12 +293,14 @@ pub struct FilesetRevUrlNewRow { pub struct FilesetRevRow { pub id: Uuid, pub extra_json: Option<serde_json::Value>, + pub content_scope: Option<String>, } #[derive(Debug, Associations, AsChangeset, Insertable)] #[table_name = "fileset_rev"] pub struct FilesetRevNewRow { pub extra_json: Option<serde_json::Value>, + pub content_scope: Option<String>, } entity_structs!( @@ -360,6 +364,7 @@ pub struct WebcaptureRevRow { pub extra_json: Option<serde_json::Value>, pub original_url: String, pub timestamp: chrono::NaiveDateTime, + pub content_scope: Option<String>, } #[derive(Debug, Associations, AsChangeset, Insertable)] @@ -368,6 +373,7 @@ pub struct WebcaptureRevNewRow { pub extra_json: Option<serde_json::Value>, pub original_url: String, pub timestamp: chrono::NaiveDateTime, + pub content_scope: Option<String>, } entity_structs!( diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs index e0a54233..e3d16202 100644 --- a/rust/src/database_schema.rs +++ b/rust/src/database_schema.rs @@ -163,6 +163,7 @@ table! { sha256 -> Nullable<Text>, md5 -> Nullable<Text>, mimetype -> Nullable<Text>, + content_scope -> Nullable<Text>, } } @@ -208,6 +209,7 @@ table! { fileset_rev (id) { id -> Uuid, extra_json -> Nullable<Jsonb>, + content_scope -> Nullable<Text>, } } @@ -372,6 +374,7 @@ table! 
{ extra_json -> Nullable<Jsonb>, original_url -> Text, timestamp -> Timestamptz, + content_scope -> Nullable<Text>, } } diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs index 19cb58ea..f48246a5 100644 --- a/rust/src/entity_crud.rs +++ b/rust/src/entity_crud.rs @@ -1042,6 +1042,7 @@ impl EntityCrud for FileEntity { size: None, urls: None, mimetype: None, + content_scope: None, release_ids: None, releases: None, state: Some(ident_row.state().unwrap().shortname()), @@ -1125,6 +1126,7 @@ impl EntityCrud for FileEntity { size: rev_row.size_bytes, urls: Some(urls), mimetype: rev_row.mimetype, + content_scope: rev_row.content_scope, release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()), releases: None, state, @@ -1160,6 +1162,7 @@ impl EntityCrud for FileEntity { sha256: model.sha256.clone(), md5: model.md5.clone(), mimetype: model.mimetype.clone(), + content_scope: model.content_scope.clone(), extra_json: model.extra.clone(), }) .collect::<Vec<FileRevNewRow>>(), @@ -1245,6 +1248,7 @@ impl EntityCrud for FilesetEntity { } Ok(FilesetEntity { + content_scope: None, manifest: None, urls: None, release_ids: None, @@ -1340,6 +1344,7 @@ impl EntityCrud for FilesetEntity { .collect(); Ok(FilesetEntity { + content_scope: rev_row.content_scope.clone(), manifest: Some(manifest), urls: Some(urls), release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()), @@ -1376,6 +1381,7 @@ impl EntityCrud for FilesetEntity { models .iter() .map(|model| FilesetRevNewRow { + content_scope: model.content_scope.clone(), extra_json: model.extra.clone(), }) .collect::<Vec<FilesetRevNewRow>>(), @@ -1492,6 +1498,7 @@ impl EntityCrud for WebcaptureEntity { archive_urls: None, original_url: None, timestamp: None, + content_scope: None, release_ids: None, releases: None, state: Some(ident_row.state().unwrap().shortname()), @@ -1590,6 +1597,7 @@ impl EntityCrud for WebcaptureEntity { archive_urls: Some(archive_urls), original_url: 
Some(rev_row.original_url), timestamp: Some(chrono::DateTime::from_utc(rev_row.timestamp, chrono::Utc)), + content_scope: rev_row.content_scope, release_ids: Some(release_ids.iter().map(|fcid| fcid.to_string()).collect()), releases: None, state, @@ -1628,6 +1636,7 @@ impl EntityCrud for WebcaptureEntity { // these unwraps safe because of check above original_url: model.original_url.clone().unwrap(), timestamp: model.timestamp.unwrap().naive_utc(), + content_scope: model.content_scope.clone(), extra_json: model.extra.clone(), }) .collect::<Vec<WebcaptureRevNewRow>>(), diff --git a/rust/src/server.rs b/rust/src/server.rs index 4d30ecbe..fb55f03c 100644 --- a/rust/src/server.rs +++ b/rust/src/server.rs @@ -72,6 +72,7 @@ pub fn create_test_server() -> Result<Server> { diesel_migrations::revert_latest_migration(&conn).unwrap(); diesel_migrations::revert_latest_migration(&conn).unwrap(); diesel_migrations::revert_latest_migration(&conn).unwrap(); + diesel_migrations::revert_latest_migration(&conn).unwrap(); diesel_migrations::run_pending_migrations(&conn).unwrap(); Ok(server) } diff --git a/rust/tests/test_api_server_http.rs b/rust/tests/test_api_server_http.rs index 8f691e0d..0601a26b 100644 --- a/rust/tests/test_api_server_http.rs +++ b/rust/tests/test_api_server_http.rs @@ -636,6 +636,7 @@ fn test_post_file() { {"url": "http://web.archive.org/2/http://archive.org/asdf.txt", "rel": "webarchive" } ], "mimetype": "application/pdf", + "content_scope": "article", "release_ids": [ "aaaaaaaaaaaaarceaaaaaaaaae", "aaaaaaaaaaaaarceaaaaaaaaai" @@ -711,6 +712,7 @@ fn test_post_fileset() { "aaaaaaaaaaaaarceaaaaaaaaae", "aaaaaaaaaaaaarceaaaaaaaaai" ], + "content_scope": "dataset", "extra": { "source": "speculation" } }"#, &router, @@ -764,6 +766,7 @@ fn test_post_webcapture() { headers.clone(), r#"{"original_url": "https://bnewbold.net/", "timestamp": "2018-12-28T05:06:07Z", + "content_scope": "landing-page", "cdx": [ {"surt": "org,asheesh,)/robots.txt", "timestamp": 
"2018-12-28T05:06:07Z", |