diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-20 18:02:56 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-20 18:02:56 -0700 |
commit | a72019e6e788be64420719c5045e40614098c106 (patch) | |
tree | 4c44d08f15c462ba55691b539c81cde80325e223 /python/scripts | |
parent | 8e4a39cfce3d9ba1bec98855831be2cebdd951be (diff) | |
download | sandcrawler-a72019e6e788be64420719c5045e40614098c106.tar.gz sandcrawler-a72019e6e788be64420719c5045e40614098c106.zip |
doaj and unpaywall transforms: more domains to skip
Diffstat (limited to 'python/scripts')
-rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 1 | ||||
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 3 |
2 files changed, 1 insertions, 3 deletions
```diff
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 67286b9..aef5c12 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -23,6 +23,7 @@ DOMAIN_BLOCKLIST = [
     "ncbi.nlm.nih.gov/",
     # "semanticscholar.org/",
     "://doi.org/",
+    "://dx.doi.org/",
     "zenodo.org/",
     "figshare.com/",
     "://archive.org/",
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index ad5353b..cb64a1a 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -15,12 +15,9 @@ DOMAIN_BLOCKLIST = [
     "://arxiv.org/",
     "://europepmc.org/",
     "ncbi.nlm.nih.gov/",
-    "semanticscholar.org/",
     "://doi.org/",
     "zenodo.org/",
     "figshare.com/",
-    "://archive.org/",
-    ".archive.org/",
 ]
 
 RELEASE_STAGE_MAP = {
```