From 3a8dada3267c56fd62b84201b4af96889e4103e6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 20 Apr 2022 16:05:29 -0700 Subject: cleanups: isiarticles --- extra/bulk_edits/2022-04-20_isiarticles.md | 26 ++++++++++++++++++++++++++ extra/bulk_edits/CHANGELOG.md | 8 ++++++++ extra/cleanups/file_isiarticles.md | 15 +++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 extra/bulk_edits/2022-04-20_isiarticles.md create mode 100644 extra/cleanups/file_isiarticles.md diff --git a/extra/bulk_edits/2022-04-20_isiarticles.md b/extra/bulk_edits/2022-04-20_isiarticles.md new file mode 100644 index 00000000..ca2cc6f9 --- /dev/null +++ b/extra/bulk_edits/2022-04-20_isiarticles.md @@ -0,0 +1,26 @@ + +See metadata cleanups for context. Basically, there are a couple tens of thousands of sample/spam articles hosted on the domain isiarticles.com. + +## Prod Updates + +Start small: + + export FATCAT_API_HOST=https://api.fatcat.wiki + export FATCAT_AUTH_WORKER_CLEANUP=[...] + export FATCAT_API_AUTH_TOKEN=$FATCAT_AUTH_WORKER_CLEANUP + + fatcat-cli search file domain:isiarticles.com --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com/' \ + | head -n50 \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept + # editgroup_ihx75kzsebgzfisgjrv67zew5e + +The full batch: + + fatcat-cli search file domain:isiarticles.com --entity-json -n0 \ + | rg -v '"content_scope"' \ + | rg 'isiarticles.com/' \ + | pv -l \ + | fatcat-cli batch update file release_ids= content_scope=sample --description 'Un-link and mark isiarticles PDFs as content_scope=sample' --auto-accept diff --git a/extra/bulk_edits/CHANGELOG.md b/extra/bulk_edits/CHANGELOG.md index b6bfcb96..94a32947 100644 --- a/extra/bulk_edits/CHANGELOG.md +++ b/extra/bulk_edits/CHANGELOG.md @@ -9,6 +9,14 @@ this file should probably get merged into the guide at some point. 
This file should not turn in to a TODO list! +## 2022-04 + +Imported some initial fileset entities. + +Updated about 25k file entities from isiarticles.com, which are samples (spam +for a translation service), to remove release linkage and set +`content_scope=sample` (similar to the Springer "page one" case). + ## 2022-03 Ran a journal-level metadata update, using chocula. diff --git a/extra/cleanups/file_isiarticles.md b/extra/cleanups/file_isiarticles.md new file mode 100644 index 00000000..cb3785af --- /dev/null +++ b/extra/cleanups/file_isiarticles.md @@ -0,0 +1,15 @@ + +The domain isiarticles.com hosts a bunch of partial spam PDFs. + +As a first pass, we can remove these via the domain itself. + +A "blocklist" for this domain has been added to sandcrawler, so they should not +get auto-ingested in the future. + + # 2022-04-20 + fatcat-cli search file domain:isiarticles.com --count + 25067 + +## Prod Cleanup + +See bulk edits log. -- cgit v1.2.3