about | summary | refs | log | tree | commit | diff | stats
path: root/python/scripts
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-09-28 15:40:35 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-09-28 15:40:35 -0700
commit dfd4605d84712eccb95a63e50b0bcb343642b433 (patch)
tree ff5f8fd115ca3d6517724dae3788d600914ae03f /python/scripts
parent a20e88ae7794c0ac15a086504318795c554307d0 (diff)
download sandcrawler-dfd4605d84712eccb95a63e50b0bcb343642b433.tar.gz
sandcrawler-dfd4605d84712eccb95a63e50b0bcb343642b433.zip
update oai-pmh ingest request transform script
Diffstat (limited to 'python/scripts')
-rwxr-xr-x python/scripts/oai2ingestrequest.py 40
1 file changed, 38 insertions, 2 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 9607b85..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -25,10 +25,40 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# Lowercase OAI identifier prefixes (matched literally with str.startswith, no wildcards) for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+ "oai:hypotheses.org:",
+]
+
RELEASE_STAGE_MAP = {
"info:eu-repo/semantics/draftVersion": "draft",
"info:eu-repo/semantics/submittedVersion": "submitted",
@@ -55,6 +85,11 @@ def transform(obj):
if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
@@ -97,16 +132,17 @@ def transform(obj):
"base_url": base_url,
"ingest_type": "pdf",
"link_source": "oai",
- "link_source_id": obj["oai"].lower(),
+ "link_source_id": oai_id,
"ingest_request_source": "metha-bulk",
"release_stage": release_stage,
"rel": rel,
"ext_ids": {
- "doi": doi,
"oai": obj["oai"].lower(),
},
"edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests