summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2021-11-11 01:11:49 +0000
committerbnewbold <bnewbold@archive.org>2021-11-11 01:11:49 +0000
commit7e3f91f1a49ea85707cae31125021ba761f5373d (patch)
tree34c482d15821765ffd7a27f6f049c320a2bf4b2a /python/fatcat_tools
parentb6d228b7171252c8f9f70194c09aba0ed0c55567 (diff)
parentcd09c6d6bd4deef0627de4f8a8a301725db01e14 (diff)
downloadfatcat-7e3f91f1a49ea85707cae31125021ba761f5373d.tar.gz
fatcat-7e3f91f1a49ea85707cae31125021ba761f5373d.zip
Merge branch 'bnewbold-cleanups-nov2021' into 'master'
Fatcat metadata cleanups/fixups, November 2021 Three cleanups implemented in this branch: - update non-lowercase DOIs on releases (couple hundred thousand entities) - fix incorrectly imported file/release pairs, on the file entity side (~250k entities) - expand truncated wayback URL timestamps in file entities (up to 10 million entities) Instead of proposals, there are documents for each cleanup in `notes/cleanups/`. Have done spot testing of tens of thousands of entities each in QA, and confident about running in production. Plan is to run updates in the order above. DOI and bugfix updates will go fairly fast; the wayback timestamp updates will go slower, and result in large re-indexing load both in fatcat and scholar, because both release and work entities will get triggered for update when file entities are updated.
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/cleanups/file_release_bugfix.py241
-rw-r--r--python/fatcat_tools/cleanups/file_short_wayback_ts.py344
-rw-r--r--python/fatcat_tools/cleanups/release_lowercase_doi.py154
-rw-r--r--python/fatcat_tools/importers/common.py9
4 files changed, 748 insertions, 0 deletions
diff --git a/python/fatcat_tools/cleanups/file_release_bugfix.py b/python/fatcat_tools/cleanups/file_release_bugfix.py
new file mode 100644
index 00000000..dc27f9b5
--- /dev/null
+++ b/python/fatcat_tools/cleanups/file_release_bugfix.py
@@ -0,0 +1,241 @@
+import argparse
+import os
+import sys
+from typing import Any, Dict
+
+import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
+
+from fatcat_tools import authenticated_api, public_api, uuid2fcid
+from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
+from fatcat_tools.normal import clean_doi
+
+
+class FileReleaseBugfix(EntityImporter):
+ """
+ This is a one-off / one-time cleanup script for file entities which got
+ imported with incorrect release ident mappings, due to a bug in the file
+ ingest importer.
+
+ While this calls itself a cleanup, it is based on the import code path. It
+ is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
+ instead it has a __main__ function and is invoked like:
+
+ python -m fatcat_tools.cleans.file_release_bugfix - < blah.json
+ """
+
+ def __init__(self, api: ApiClient, **kwargs):
+
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Correct bad file/release import mappings"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.FileReleaseBugfix")
+ super().__init__(
+ api,
+ do_updates=True,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
+ self.testing_mode = False
+
+ def want(self, row: Dict[str, Any]) -> bool:
+ if not (
+ row.get("edit_extra")
+ and row["edit_extra"].get("link_source")
+ and row["edit_extra"].get("link_source_id")
+ ):
+ self.counts["skip-partial"] += 1
+ return False
+ if row["edit_extra"]["link_source"] not in ["unpaywall", "doi"]:
+ self.counts["skip-link-source"] += 1
+ return False
+ if row["edit_extra"].get("ingest_request_source") not in [
+ "unpaywall",
+ "fatcat-changelog",
+ ]:
+ self.counts["skip-ingest-request-source"] += 1
+ return False
+ if not row["edit_extra"]["link_source_id"].startswith("10."):
+ self.counts["skip-source-id-not-doi"] += 1
+ return False
+ return True
+
+ def parse_record(self, row: Dict[str, Any]) -> FileEntity:
+
+ # bezerk mode doesn't make sense for this importer
+ assert self.bezerk_mode is False
+
+ file_ident = uuid2fcid(row["file_ident"])
+ wrong_release_ident = uuid2fcid(row["wrong_release_ident"])
+ edit_extra = row["edit_extra"]
+ assert edit_extra["link_source"] in ["unpaywall", "doi"]
+ file_edit_doi = clean_doi(edit_extra["link_source_id"])
+
+ if not file_edit_doi:
+ self.counts["skip-bad-doi"] += 1
+ return False
+
+ # check that the "wrong" release exists and doesn't have the DOI
+ wrong_release = None
+ try:
+ wrong_release = self.api.get_release(wrong_release_ident)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not wrong_release:
+ self.counts["skip-wrong-release-missing"] += 1
+ return None
+
+ if clean_doi(wrong_release.ext_ids.doi) == file_edit_doi:
+ self.counts["skip-wrong-release-is-ok"] += 1
+ return None
+
+ # fetch the "correct" release, if any
+ fixed_release_ids = []
+ correct_release = None
+ try:
+ correct_release = self.api.lookup_release(doi=file_edit_doi)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if correct_release:
+ fixed_release_ids.append(correct_release.ident)
+
+ fe = FileEntity(
+ ident=file_ident,
+ release_ids=fixed_release_ids,
+ edit_extra=edit_extra,
+ )
+ fe._wrong_release_ident = wrong_release_ident
+ return fe
+
+ def try_update(self, fe: FileEntity) -> bool:
+
+ wrong_release_ident = fe._wrong_release_ident
+ assert len(wrong_release_ident) == 26
+
+ # should always be existing... but in QA it might not be
+ existing = None
+ try:
+ existing = self.api.get_file(fe.ident)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ self.counts["skip-existing-not-found"] += 1
+ return False
+
+ if existing.state != "active":
+ self.counts["skip-existing-entity-state"] += 1
+ return False
+
+ if wrong_release_ident not in existing.release_ids:
+ self.counts["skip-existing-fixed"] += 1
+ return False
+
+ # fetch existing history to verify mismatch
+ history = self.api.get_file_history(existing.ident)
+
+ for entry in history:
+ if entry.editgroup.editor.is_bot is not True:
+ self.counts["skip-existing-edit-history-human"] += 1
+ return False
+
+ bad_edit = history[-1].edit
+ if bad_edit.extra != fe.edit_extra:
+ self.counts["skip-existing-edit-history-extra-mismatch"] += 1
+ return False
+
+ bad_editgroup = history[-1].editgroup
+ if not bad_editgroup.extra:
+ self.counts["skip-existing-editgroup-missing-extra"] += 1
+ return False
+
+ if (
+ bad_editgroup.editor_id != "scmbogxw25evtcesfcab5qaboa"
+ or bad_editgroup.extra.get("agent") != "fatcat_tools.IngestFileResultImporter"
+ or not bad_editgroup.extra.get("git_rev", "").startswith("v0.3")
+ or bad_editgroup.created.year != 2020
+ ):
+ self.counts["skip-existing-edit-history-mismatch"] += 1
+ return False
+
+ existing.release_ids = [ri for ri in existing.release_ids if ri != wrong_release_ident]
+
+ if len(fe.release_ids) == 1:
+ if fe.release_ids[0] not in existing.release_ids:
+ existing.release_ids.append(fe.release_ids[0])
+
+ existing.edit_extra = fe.edit_extra
+
+ # not doing a check for "in current editgroup", because the source of
+ # these corrections (entity dump) contains no dupes
+
+ if not self.testing_mode:
+ self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
+ self.counts["update"] += 1
+ return False
+
+
+def test_file_release_bugfix() -> None:
+ api = public_api("http://localhost:9411/v0")
+ frbc = FileReleaseBugfix(api=api)
+ frbc.testing_mode = True
+
+ assert frbc.want({"this": "asdf"}) is False
+
+ example_line: Dict[str, Any] = {
+ "file_ident": "00000000-0000-0000-3333-000000000002",
+ "wrong_release_ident": "00000000-0000-0000-4444-000000000002",
+ "edit_extra": {
+ "link_source": "unpaywall",
+ "link_source_id": "10.1371/journal.pmed.0020124",
+ "ingest_request_source": "unpaywall",
+ },
+ }
+
+ fe1 = frbc.parse_record(example_line)
+ print(frbc.counts)
+ frbc.try_update(fe1)
+
+ # NOTE: this test is pretty incompleted
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.set_defaults(
+ auth_var="FATCAT_AUTH_WORKER_CLEANUP",
+ )
+ parser.add_argument(
+ "json_file",
+ help="File with jsonlines with cleanup context",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ args = parser.parse_args()
+ api = authenticated_api(
+ args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get(args.auth_var),
+ )
+
+ frbc = FileReleaseBugfix(
+ api,
+ edit_batch_size=args.batch_size,
+ )
+ JsonLinePusher(frbc, args.json_file).run()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/fatcat_tools/cleanups/file_short_wayback_ts.py b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
new file mode 100644
index 00000000..bdd49f9b
--- /dev/null
+++ b/python/fatcat_tools/cleanups/file_short_wayback_ts.py
@@ -0,0 +1,344 @@
+import argparse
+import copy
+import os
+import sys
+from typing import Any, Dict
+
+import fatcat_openapi_client
+from fatcat_openapi_client import ApiClient, FileEntity
+
+from fatcat_tools import authenticated_api, entity_from_dict, public_api
+from fatcat_tools.importers.common import EntityImporter, JsonLinePusher
+
+
+class FileShortWaybackTimestampCleanup(EntityImporter):
+ """
+ This is a one-off / one-time cleanup script for file entities, fix short
+ timestamps in wayback URLs. These timestamps are supposed to have 14 digits
+ (datetime with year, hour, seconds, etc). Some legacy file imports ended up
+ with only 4 or 12 digits.
+
+ While this calls itself a cleanup, it is based on the import code path. It
+ is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
+ instead it has a __main__ function and is invoked like:
+
+ python -m fatcat_tools.cleans.file_short_wayback_ts - < blah.json
+ """
+
+ def __init__(self, api: ApiClient, **kwargs):
+
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Expand trunacted timestamps in wayback URLs"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get(
+ "agent", "fatcat_tools.FileShortWaybackTimestampCleanup"
+ )
+ super().__init__(
+ api,
+ do_updates=True,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
+ self.testing_mode = False
+
+ def want(self, row: Dict[str, Any]) -> bool:
+ if row["status"].startswith("success"):
+ return True
+ else:
+ self.counts["skip-status"] += 1
+ return False
+
+ def parse_record(self, row: Dict[str, Any]) -> FileEntity:
+
+ # bezerk mode doesn't make sense for this importer
+ assert self.bezerk_mode is False
+
+ fe: FileEntity = entity_from_dict(row["file_entity"], FileEntity)
+ status: str = row["status"]
+ assert status.startswith("success")
+ url_expansions: Dict[str, str] = row["full_urls"]
+ assert len(url_expansions) >= 1
+
+ # actual cleanup happens here
+ any_fixed = False
+ for fe_url in fe.urls:
+ if "://web.archive.org/web/" not in fe_url.url:
+ continue
+ seq = fe_url.url.split("/")
+ partial_ts = seq[4]
+ original_url = "/".join(seq[5:])
+ if seq[2] != "web.archive.org":
+ continue
+ if len(partial_ts) not in [4, 12]:
+ continue
+ if fe_url.url in url_expansions:
+ fix_url = url_expansions[fe_url.url]
+ # defensive checks
+ if not (
+ f"/web/{partial_ts}" in fix_url
+ and fe_url.url.endswith(original_url)
+ and fix_url.endswith(original_url)
+ ):
+ print(
+ f"bad replacement URL: partial_ts={partial_ts} original={original_url} fix_url={fix_url}",
+ file=sys.stderr,
+ )
+ self.counts["skip-bad-replacement"] += 1
+ return None
+ assert "://" in fix_url
+ fe_url.url = fix_url
+ any_fixed = True
+
+ if not any_fixed:
+ self.counts["skip-no-fixes"] += 1
+ return None
+
+ # do any other generic file entity cleanups
+ # this includes removing duplicates
+ fe = self.generic_file_cleanups(fe)
+
+ # verify that there are no exact duplicates
+ final_urls = [u.url for u in fe.urls]
+ assert len(final_urls) == len(list(set(final_urls)))
+
+ return fe
+
+ def try_update(self, fe: FileEntity) -> bool:
+
+ # should always be existing
+ try:
+ existing = self.api.get_file(fe.ident)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ self.counts["skip-existing-not-found"] += 1
+ return False
+
+ if existing.state != "active":
+ self.counts["skip-existing-entity-state"] += 1
+ return False
+
+ if existing.sha1 != fe.sha1:
+ self.counts["skip-existing-mismatch"] += 1
+ return False
+
+ assert fe.revision and existing.revision
+ if existing.revision != fe.revision:
+ self.counts["skip-revision-changed"] += 1
+ return False
+
+ # verify that at least one URL remains
+ if not fe.urls or len(fe.urls) < 1:
+ self.counts["skip-no-urls"] += 1
+ return False
+
+ # verify that all wayback urls have 14-digit timestamps, and are generally well-formed
+ for u in fe.urls:
+ if "://web.archive.org/web/" not in u.url:
+ continue
+ if u.rel != "webarchive":
+ self.counts["skip-bad-wayback-rel"] += 1
+ return False
+ seg = u.url.split("/")
+ if (
+ len(seg) < 6
+ or seg[0] != "https:"
+ or seg[2] != "web.archive.org"
+ or seg[3] != "web"
+ ):
+ self.counts["skip-bad-wayback"] += 1
+ return False
+ if len(seg[4]) != 14 or not seg[4].isdigit():
+ self.counts["skip-bad-wayback-timestamp"] += 1
+ return False
+
+ if existing == fe or existing.urls == fe.urls:
+ self.counts["skip-no-change"] += 1
+ return False
+
+ # not doing a check for "in current editgroup", because the source of
+ # these corrections (entity dump) contains no dupes
+
+ if not self.testing_mode:
+ # note: passing 'fe' instead of 'existing' here, which is not
+ # usually how it goes!
+ self.api.update_file(self.get_editgroup_id(), fe.ident, fe)
+ self.counts["update"] += 1
+ return False
+
+
+def test_short_wayback_ts() -> None:
+ api = public_api("http://localhost:9411/v0")
+ fswtc = FileShortWaybackTimestampCleanup(api=api)
+ fswtc.testing_mode = True
+
+ assert fswtc.want({"status": "fail"}) is False
+ assert fswtc.want({"status": "success-self"}) is True
+
+ example_line: Dict[str, Any] = {
+ "status": "success-db",
+ "file_entity": {
+ # note: doesn't match actual entity
+ "release_ids": ["waldfsctnbcpdbmasgduhaaaaa"],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
+ "rel": "web",
+ },
+ {
+ "url": "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971",
+ "rel": "webarchive",
+ },
+ ],
+ "sha256": "0b9e09480ed2e1f08f3c6c72f57ce12b52ea265f580f8810e606b49d64234b29",
+ "sha1": "be714299b9be21b5afdaa7affd7d710c58269433",
+ "md5": "9edb542be5b3446a1905e61a8a3abebd",
+ "size": 666242,
+ "revision": "fe949be8-7bf9-4c17-be28-8e3e90fb85bd",
+ "ident": "4ghpvs2t2rdtrdum2mkreh62me",
+ "state": "active",
+ },
+ "full_urls": {
+ "https://web.archive.org/web/201904301022/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971": "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
+ },
+ }
+ example_fe = entity_from_dict(example_line["file_entity"], FileEntity)
+
+ fe1 = copy.copy(example_fe)
+ fe1.urls[
+ 1
+ ].url = "https://web.archive.org/web/20190430102239/https://papiro.unizar.es/ojs/index.php/ais/article/download/2187/1971"
+ assert fswtc.parse_record(example_line) == fe1
+
+ # update code path; requires a known file ident and API running locally
+ assert fswtc.counts["update"] == 0
+ dummy_fe = api.get_file("aaaaaaaaaaaaamztaaaaaaaaai")
+ fe1.ident = dummy_fe.ident
+
+ assert fswtc.try_update(fe1) is False
+ assert fswtc.counts["skip-existing-mismatch"] == 1
+
+ fe1.sha1 = dummy_fe.sha1
+ assert fswtc.try_update(fe1) is False
+ assert fswtc.counts["skip-revision-changed"] == 1
+
+ fe1.revision = dummy_fe.revision
+ assert fswtc.try_update(fe1) is False
+ print(fswtc.counts)
+ assert fswtc.counts["update"] == 1
+
+ # another example, which failed with an assertion in prod due to duplicated URLs
+ example_line2: Dict[str, Any] = {
+ "file_entity": {
+ "release_ids": ["22jt7euq4fafhblzullmnesso4"],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+ "rel": "repository",
+ },
+ {
+ "url": "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+ "rel": "webarchive",
+ },
+ {
+ "url": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf",
+ "rel": "webarchive",
+ },
+ ],
+ "sha256": "51ec58e7a2325d28d1deb0a4bc6422c0e4ae7b12ffb0b6298981a7b8b7730b19",
+ "sha1": "ad96a584fc6073b9a23736bc61ae0ec4a5661433",
+ "md5": "3d509743359649e34a27ae70c5cd3018",
+ "size": 430665,
+ "extra": {
+ "shadows": {"scimag_doi": "10.4259/ibk.59.1_194", "scimag_id": "69089904"}
+ },
+ "revision": "f1fa11ff-d521-45cf-9db1-cb3c8bd3ea48",
+ "ident": "duymhmxk3fgtzk37yp2pvthtxq",
+ "state": "active",
+ },
+ "full_urls": {
+ "https://web.archive.org/web/201811010021/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf": "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
+ },
+ "status": "success-self",
+ }
+
+ fe2 = fswtc.parse_record(example_line2)
+ assert len(fe2.urls) == 2
+ assert fe2.urls[0].rel == "repository"
+ assert (
+ fe2.urls[1].url
+ == "https://web.archive.org/web/20181101002154/https://www.jstage.jst.go.jp/article/ibk/59/1/59_KJ00007115297/_pdf"
+ )
+
+ # ensure URL order is stable
+ example_line3: Dict[str, Any] = {
+ "file_entity": {
+ "release_ids": ["5rin7f2cdvc5hjkqqw53z7sr3i"],
+ "mimetype": "application/pdf",
+ "urls": [
+ {"url": "https://pubs.usgs.gov/bul/1108/report.pdf", "rel": "web"},
+ {
+ "url": "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf",
+ "rel": "webarchive",
+ },
+ ],
+ "sha256": "714cd48c2577e9b058b8f16b4574765da685f67582cc53898a9d6933e45d6cc0",
+ "sha1": "4efbdb517c0ff3f58136e4efbbec2bd9315400d3",
+ "md5": "89b6e6cc4e0259317e26ddf1a9a336a0",
+ "size": 41265,
+ "revision": "926fcf73-e644-4446-a24b-4d0940a2cf65",
+ "ident": "lvnz23nzijaapf5iti45zez6zu",
+ "state": "active",
+ },
+ "full_urls": {
+ "https://web.archive.org/web/201904291643/https://pubs.usgs.gov/bul/1108/report.pdf": "https://web.archive.org/web/20190429164342/https://pubs.usgs.gov/bul/1108/report.pdf"
+ },
+ "status": "success-db",
+ }
+
+ fe3 = fswtc.parse_record(example_line3)
+ assert len(fe3.urls) == 2
+ assert fe3.urls[0].rel == "web"
+ assert fe3.urls[0].url == "https://pubs.usgs.gov/bul/1108/report.pdf"
+ assert fe3.urls[1].rel == "webarchive"
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=100, type=int)
+ parser.set_defaults(
+ auth_var="FATCAT_AUTH_WORKER_CLEANUP",
+ )
+ parser.add_argument(
+ "json_file",
+ help="File with jsonlines from file_meta schema to import from",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ args = parser.parse_args()
+ api = authenticated_api(
+ args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get(args.auth_var),
+ )
+
+ fswtc = FileShortWaybackTimestampCleanup(
+ api,
+ edit_batch_size=args.batch_size,
+ )
+ JsonLinePusher(fswtc, args.json_file).run()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py
new file mode 100644
index 00000000..5e3275db
--- /dev/null
+++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py
@@ -0,0 +1,154 @@
+import argparse
+import os
+import sys
+
+from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds
+
+from fatcat_tools import authenticated_api, public_api
+from fatcat_tools.importers.common import EntityImporter, LinePusher
+
+
+class ReleaseLowercaseDoiCleanup(EntityImporter):
+ """
+ This is a one-off / one-time cleanup script for release entities, to fix
+ upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase.
+
+ While this calls itself a cleanup, it is based on the import code path. It
+ is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
+ instead it has a __main__ function and is invoked like:
+
+ python -m fatcat_tools.cleans.release_lowercase_doi - < blah.tsv
+
+ It expects to get a simple text line on stdin, which is a release entity.
+ The correction is implemented by fetching the current version of the
+ entity, verifying the issue, and updating if it is still a problem.
+
+ This does not try to do any merging, just corrects the case in a single
+ update.
+ """
+
+ def __init__(self, api: ApiClient, **kwargs):
+
+ eg_desc = (
+ kwargs.pop("editgroup_description", None)
+ or "Normalize release DOIs (extid) to lower-case"
+ )
+ eg_extra = kwargs.pop("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup")
+ super().__init__(
+ api,
+ do_updates=True,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
+ self.testing_mode = False
+
+ def want(self, row: str) -> bool:
+ row = row.strip()
+ if not row:
+ return False
+ row = row.split()[0]
+ if len(row) == 26:
+ return True
+ else:
+ return False
+
+ def parse_record(self, row: str) -> ReleaseEntity:
+
+ # bezerk mode doesn't make sense for this importer
+ assert self.bezerk_mode is False
+
+ ident = row.strip().split()[0]
+ assert len(ident) == 26
+
+ return ReleaseEntity(
+ ident=ident,
+ ext_ids=ReleaseExtIds(),
+ )
+
+ def try_update(self, re: ReleaseEntity) -> bool:
+
+ # should always be existing, but sometimes not because of prod/QA flip
+ existing = None
+ try:
+ existing = self.api.get_release(re.ident)
+ except ApiException as err:
+ if err.status != 404:
+ raise err
+
+ if not existing:
+ self.counts["skip-existing-not-found"] += 1
+ return False
+
+ if existing.state != "active":
+ self.counts["skip-existing-entity-state"] += 1
+ return False
+
+ if not existing.ext_ids.doi:
+ self.counts["skip-existing-no-doi"] += 1
+ return False
+
+ if existing.ext_ids.doi == existing.ext_ids.doi.lower():
+ self.counts["skip-existing-doi-fine"] += 1
+ return False
+
+ existing.ext_ids.doi = existing.ext_ids.doi.lower()
+
+ # not doing a check for "in current editgroup", because the source of
+ # these corrections (entity dump) contains no dupes
+
+ if not self.testing_mode:
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
+ self.counts["update"] += 1
+ return False
+
+
+def test_lowercase_doi() -> None:
+ api = public_api("http://localhost:9411/v0")
+ rldc = ReleaseLowercaseDoiCleanup(api=api)
+ rldc.testing_mode = True
+
+ assert rldc.want("") is False
+ assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai") is True
+ assert rldc.want("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD") is True
+ rldc.parse_record("aaaaaaaaaaaaarceaaaaaaaaai")
+
+ dummy_re = api.get_release("aaaaaaaaaaaaarceaaaaaaaaai")
+ assert rldc.try_update(dummy_re) is False
+ assert rldc.counts["skip-existing-doi-fine"] == 1
+ # this isn't a very complete test, doesn't get to update part
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
+ )
+ parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
+ parser.set_defaults(
+ auth_var="FATCAT_AUTH_WORKER_CLEANUP",
+ )
+ parser.add_argument(
+ "idents_file",
+ help="File with release identifier to try updating",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
+
+ args = parser.parse_args()
+ api = authenticated_api(
+ args.host_url,
+ # token is an optional kwarg (can be empty string, None, etc)
+ token=os.environ.get(args.auth_var),
+ )
+
+ rldc = ReleaseLowercaseDoiCleanup(
+ api,
+ edit_batch_size=args.batch_size,
+ )
+ LinePusher(rldc, args.idents_file).run()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index fd472d11..2ec6efda 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -436,6 +436,15 @@ class EntityImporter:
if u.rel == "social":
u.rel = "academicsocial"
+ # remove exact URL duplicates, while preserving order, and removing
+ # "later" copies, not "first" copies
+ # this is sensitive to both url.url and url.rel combined!
+ dedupe_urls = []
+ for url_pair in existing.urls:
+ if url_pair not in dedupe_urls:
+ dedupe_urls.append(url_pair)
+ existing.urls = dedupe_urls
+
# remove URLs which are near-duplicates
redundant_urls = []
all_urls = [u.url for u in existing.urls]