Diffstat (limited to 'python/scripts/oai2ingestrequest.py')
-rwxr-xr-x  python/scripts/oai2ingestrequest.py  112
1 file changed, 76 insertions(+), 36 deletions(-)
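For orientation, the harvest dump records this script consumes are JSON objects carrying at least "oai" and "urls" fields, plus optional "formats", "doi", and "types" fields that transform() (below) inspects. A minimal sketch of such a record, with hypothetical values (the field names match what the script reads; the values are made up):

# Illustrative input record only; values are hypothetical, field names are
# the ones transform() reads from the OAI-PMH bulk dump.
record = {
    "oai": "oai:example.edu:12345",
    "urls": ["https://example.edu/bitstream/12345/paper.pdf"],
    "formats": ["application/pdf"],
    "doi": ["10.1234/example.12345"],
    "types": ["info:eu-repo/semantics/publishedVersion"],
}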
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+    "oai:hypotheses.org:",
+]
+
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -50,38 +80,43 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -94,23 +129,25 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": oai_id,
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -121,17 +158,20 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file",
help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
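With the patch applied, transform() emits one ingest request per non-blocklisted URL, and the DOI now only appears in ext_ids when one was actually found. A hedged sketch of the output for the hypothetical record shown above (base_url is the urlcanon/whatwg-canonicalized form of the input URL):

# Approximate result of transform(record) after this patch; values follow the
# hypothetical record above, not any real harvest entry.
# {
#     "base_url": "https://example.edu/bitstream/12345/paper.pdf",
#     "ingest_type": "pdf",
#     "link_source": "oai",
#     "link_source_id": "oai:example.edu:12345",
#     "ingest_request_source": "metha-bulk",
#     "release_stage": "published",
#     "rel": None,
#     "ext_ids": {"oai": "oai:example.edu:12345", "doi": "10.1234/example.12345"},
#     "edit_extra": {},
# }
#
# Typical invocation (the positional argument is the dump file, usually stdin,
# so "-" also works):
#   ./oai2ingestrequest.py oai_harvest.json > ingest_requests.json
#   zcat oai_harvest.json.gz | ./oai2ingestrequest.py - > ingest_requests.json

run() prints each request as a single sort_keys JSON object per line, one line per emitted request.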