about | summary | refs | log | tree | commit | diff | stats
path: root/python/scripts
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-20 18:02:56 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-20 18:02:56 -0700
commit: a72019e6e788be64420719c5045e40614098c106 (patch)
tree: 4c44d08f15c462ba55691b539c81cde80325e223 /python/scripts
parent: 8e4a39cfce3d9ba1bec98855831be2cebdd951be (diff)
download: sandcrawler-a72019e6e788be64420719c5045e40614098c106.tar.gz
sandcrawler-a72019e6e788be64420719c5045e40614098c106.zip
doaj and unpaywall transforms: more domains to skip
Diffstat (limited to 'python/scripts')
-rwxr-xr-x python/scripts/doaj2ingestrequest.py 1
-rwxr-xr-x python/scripts/unpaywall2ingestrequest.py 3
2 files changed, 1 insertion, 3 deletions
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 67286b9..aef5c12 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -23,6 +23,7 @@ DOMAIN_BLOCKLIST = [
"ncbi.nlm.nih.gov/",
# "semanticscholar.org/",
"://doi.org/",
+ "://dx.doi.org/",
"zenodo.org/",
"figshare.com/",
"://archive.org/",
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index ad5353b..cb64a1a 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -15,12 +15,9 @@ DOMAIN_BLOCKLIST = [
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
- "://archive.org/",
- ".archive.org/",
]
RELEASE_STAGE_MAP = {