From 3a8dada3267c56fd62b84201b4af96889e4103e6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 20 Apr 2022 16:05:29 -0700 Subject: cleanups: isiarticles --- extra/cleanups/file_isiarticles.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 extra/cleanups/file_isiarticles.md (limited to 'extra/cleanups/file_isiarticles.md') diff --git a/extra/cleanups/file_isiarticles.md b/extra/cleanups/file_isiarticles.md new file mode 100644 index 00000000..cb3785af --- /dev/null +++ b/extra/cleanups/file_isiarticles.md @@ -0,0 +1,15 @@ + +The domain isiarticles.com hosts a bunch of partial spam PDFs. + +As a first pass, we can remove these by matching on the domain itself. + +A "blocklist" for this domain has been added to sandcrawler, so these PDFs should not +get auto-ingested in the future. + + # 2022-04-20 + fatcat-cli search file domain:isiarticles.com --count + 25067 + +## Prod Cleanup + +See bulk edits log. -- cgit v1.2.3