summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/cleanups/release_lowercase_doi.py
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2021-11-11 01:11:49 +0000
committerbnewbold <bnewbold@archive.org>2021-11-11 01:11:49 +0000
commit7e3f91f1a49ea85707cae31125021ba761f5373d (patch)
tree34c482d15821765ffd7a27f6f049c320a2bf4b2a /python/fatcat_tools/cleanups/release_lowercase_doi.py
parentb6d228b7171252c8f9f70194c09aba0ed0c55567 (diff)
parentcd09c6d6bd4deef0627de4f8a8a301725db01e14 (diff)
downloadfatcat-7e3f91f1a49ea85707cae31125021ba761f5373d.tar.gz
fatcat-7e3f91f1a49ea85707cae31125021ba761f5373d.zip
Merge branch 'bnewbold-cleanups-nov2021' into 'master'
Fatcat metadata cleanups/fixups, November 2021

Three cleanups implemented in this branch:

- update non-lowercase DOIs on releases (couple hundred thousand entities)
- fix incorrectly imported file/release pairs, on the file entity side (~250k entities)
- expand truncated wayback URL timestamps in file entities (up to 10 million entities)

Instead of proposals, there are documents for each cleanup in `notes/cleanups/`.

Have done spot testing of tens of thousands of entities each in QA, and am confident about running in production.

Plan is to run updates in the order above. DOI and bugfix updates will go fairly fast; the wayback timestamp updates will go slower, and result in large re-indexing load both in fatcat and scholar, because both release and work entities will get triggered for update when file entities are updated.
Diffstat (limited to 'python/fatcat_tools/cleanups/release_lowercase_doi.py')
-rw-r--r--python/fatcat_tools/cleanups/release_lowercase_doi.py154
1 files changed, 154 insertions, 0 deletions
diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py
new file mode 100644
index 00000000..5e3275db
--- /dev/null
+++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py
@@ -0,0 +1,154 @@
+import argparse
+import os
+import sys
+
+from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds
+
+from fatcat_tools import authenticated_api, public_api
+from fatcat_tools.importers.common import EntityImporter, LinePusher
+
+
class ReleaseLowercaseDoiCleanup(EntityImporter):
    """
    This is a one-off / one-time cleanup script for release entities, to fix
    upper-case DOIs. In fatcat, all DOIs should be normalized to lowercase.

    While this calls itself a cleanup, it is based on the import code path. It
    is not integrated into the `fatcat_import` or `fatcat_cleanup` controller;
    instead it has a __main__ function and is invoked like:

        python -m fatcat_tools.cleanups.release_lowercase_doi - < blah.tsv

    Each input line is expected to start with a release identifier (26
    characters); any additional whitespace-separated columns are ignored.
    The correction is implemented by fetching the current version of the
    entity, verifying the issue, and updating if it is still a problem.

    This does not try to do any merging, just corrects the case in a single
    update.
    """

    # length of the release identifier expected in the first column of a line
    _IDENT_LENGTH = 26

    def __init__(self, api: ApiClient, **kwargs):
        """
        Set up the cleanup with editgroup metadata defaults.

        `editgroup_description` and `editgroup_extra` may be passed as keyword
        arguments; all remaining keyword arguments are forwarded to the parent
        constructor unchanged. Updates are always enabled (`do_updates=True`).
        """

        eg_desc = (
            kwargs.pop("editgroup_description", None)
            or "Normalize release DOIs (extid) to lower-case"
        )
        # work on a copy so the caller's dict is never mutated, and tolerate an
        # explicit `editgroup_extra=None`
        eg_extra = dict(kwargs.pop("editgroup_extra", None) or {})
        eg_extra.setdefault("agent", "fatcat_tools.ReleaseLowercaseDoiCleanup")
        super().__init__(
            api,
            do_updates=True,
            editgroup_description=eg_desc,
            editgroup_extra=eg_extra,
            **kwargs,
        )
        # when True, try_update() goes through all checks but does not send
        # the update request to the API
        self.testing_mode = False

    def want(self, row: str) -> bool:
        """
        Return True if the first whitespace-separated token of `row` has the
        length of a release identifier; blank lines are rejected.
        """
        row = row.strip()
        if not row:
            return False
        return len(row.split()[0]) == self._IDENT_LENGTH

    def parse_record(self, row: str) -> ReleaseEntity:
        """
        Build a stub ReleaseEntity carrying only the identifier found in the
        first column of `row`. The real entity is fetched in try_update().
        """

        # bezerk mode doesn't make sense for this importer
        assert self.bezerk_mode is False

        ident = row.strip().split()[0]
        assert len(ident) == self._IDENT_LENGTH

        return ReleaseEntity(
            ident=ident,
            ext_ids=ReleaseExtIds(),
        )

    def try_update(self, re: ReleaseEntity) -> bool:
        """
        Fetch the current release for `re.ident` and, if it is active and has
        a DOI which is not already lower-case, send an update with the
        lower-cased DOI (skipped in testing mode).

        Always returns False: the update is issued directly from this method.
        NOTE(review): presumably False tells the base importer not to queue
        the entity itself -- confirm against EntityImporter.

        Any ApiException other than a 404 from the lookup is re-raised.
        """

        # should always be existing, but sometimes not because of prod/QA flip
        try:
            existing = self.api.get_release(re.ident)
        except ApiException as err:
            if err.status != 404:
                raise
            existing = None

        if not existing:
            self.counts["skip-existing-not-found"] += 1
            return False

        if existing.state != "active":
            self.counts["skip-existing-entity-state"] += 1
            return False

        doi = existing.ext_ids.doi
        if not doi:
            self.counts["skip-existing-no-doi"] += 1
            return False

        lowered = doi.lower()
        if doi == lowered:
            self.counts["skip-existing-doi-fine"] += 1
            return False

        existing.ext_ids.doi = lowered

        # not doing a check for "in current editgroup", because the source of
        # these corrections (entity dump) contains no dupes

        if not self.testing_mode:
            self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
        # counted even in testing mode, where no request is actually sent
        self.counts["update"] += 1
        return False
+
+
def test_lowercase_doi() -> None:
    """
    Smoke test against a local API instance: exercises want(), parse_record()
    and the "DOI already fine" skip path of try_update(). Runs with
    testing_mode enabled, so no update request is sent.
    """
    sample_ident = "aaaaaaaaaaaaarceaaaaaaaaai"

    api = public_api("http://localhost:9411/v0")
    cleanup = ReleaseLowercaseDoiCleanup(api=api)
    cleanup.testing_mode = True

    # (input line, expected want() result)
    want_cases = (
        ("", False),
        (sample_ident, True),
        ("aaaaaaaaaaaaarceaaaaaaaaai\t10.1234/ABCD", True),
    )
    for line, expected in want_cases:
        assert cleanup.want(line) is expected
    cleanup.parse_record(sample_ident)

    fetched = api.get_release(sample_ident)
    assert cleanup.try_update(fetched) is False
    assert cleanup.counts["skip-existing-doi-fine"] == 1
    # this isn't a very complete test, doesn't get to update part
+
+
def main() -> None:
    """
    Command-line entry point.

    Parses arguments, builds an API client using the token found in the
    environment variable named by `auth_var` (FATCAT_AUTH_WORKER_CLEANUP),
    and pushes every line of the identifiers file through
    ReleaseLowercaseDoiCleanup.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--host-url", default="http://localhost:9411/v0", help="connect to this host/port"
    )
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    parser.add_argument(
        "idents_file",
        # a positional argument only honours its default when it is optional;
        # without nargs="?" the argument was always required
        nargs="?",
        help="File with release identifier to try updating ('-' reads stdin)",
        # string defaults are run through `type`, and FileType maps "-" to stdin
        default="-",
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    rldc = ReleaseLowercaseDoiCleanup(
        api,
        edit_batch_size=args.batch_size,
    )
    LinePusher(rldc, args.idents_file).run()


if __name__ == "__main__":
    main()