diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-09-28 15:40:35 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-09-28 15:40:35 -0700 |
commit | dfd4605d84712eccb95a63e50b0bcb343642b433 (patch) | |
tree | ff5f8fd115ca3d6517724dae3788d600914ae03f /python | |
parent | a20e88ae7794c0ac15a086504318795c554307d0 (diff) | |
download | sandcrawler-dfd4605d84712eccb95a63e50b0bcb343642b433.tar.gz sandcrawler-dfd4605d84712eccb95a63e50b0bcb343642b433.zip |
update oai-pmh ingest request transform script
Diffstat (limited to 'python')
-rwxr-xr-x | python/scripts/oai2ingestrequest.py | 40 |
1 files changed, 38 insertions, 2 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 9607b85..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -25,10 +25,40 @@ DOMAIN_BLOCKLIST = [
     "://archive.org/",
     ".archive.org/",
     "://127.0.0.1/",
+    "://www.kb.dk/",
+    "://kb-images.kb.dk/",
+    "://mdz-nbn-resolving.de/",
+    "://aggr.ukm.um.si/",
+    "://edoc.mpg.de/",
+    "doaj.org/",
+    "orcid.org/",
+    "://gateway.isiknowledge.com/",
     # OAI specific additions
     "://hdl.handle.net/",
 ]
 
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+    "oai:kb.dk:",
+    "oai:bdr.oai.bsb-muenchen.de:",
+    "oai:hispana.mcu.es:",
+    "oai:bnf.fr:",
+    "oai:ukm.si:",
+    "oai:biodiversitylibrary.org:",
+    "oai:hsp.org:",
+    "oai:repec:",
+    "oai:n/a:",
+    "oai:quod.lib.umich.edu:",
+    "oai:americanae.aecid.es:",
+    "oai:www.irgrid.ac.cn:",
+    "oai:espace.library.uq.edu:",
+    "oai:edoc.mpg.de:",
+    "oai:bibliotecadigital.jcyl.es:",
+    "oai:repository.erciyes.edu.tr:",
+    "oai:krm.or.kr:",
+    "oai:hypotheses.org:%",
+]
+
 RELEASE_STAGE_MAP = {
     "info:eu-repo/semantics/draftVersion": "draft",
     "info:eu-repo/semantics/submittedVersion": "submitted",
@@ -55,6 +85,11 @@ def transform(obj):
     if not obj.get("urls"):
         return []
 
+    oai_id = obj["oai"].lower()
+    for prefix in OAI_BLOCKLIST:
+        if oai_id.startswith(prefix):
+            return []
+
     # look in obj['formats'] for PDF?
     if obj.get("formats"):
         # if there is a list of formats, and it does not contain PDF, then
@@ -97,16 +132,17 @@ def transform(obj):
             "base_url": base_url,
             "ingest_type": "pdf",
             "link_source": "oai",
-            "link_source_id": obj["oai"].lower(),
+            "link_source_id": oai_id,
             "ingest_request_source": "metha-bulk",
             "release_stage": release_stage,
             "rel": rel,
             "ext_ids": {
-                "doi": doi,
                 "oai": obj["oai"].lower(),
             },
             "edit_extra": {},
         }
+        if doi:
+            request["ext_ids"]["doi"] = doi
         requests.append(request)
 
     return requests