1 files changed, 177 insertions, 0 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
new file mode 100755
index 0000000..97c38f9
--- /dev/null
+++ b/python/scripts/oai2ingestrequest.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+Transform an OAI-PMH bulk dump (JSON) into ingest requests.
+
+Eg: https://archive.org/details/oai_harvest_20200215
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+    # large OA publishers (we get via DOI)
+    # large repos and aggregators (we crawl directly)
+    "://arxiv.org/",
+    "://europepmc.org/",
+    "ncbi.nlm.nih.gov/",
+    "semanticscholar.org/",
+    "://doi.org/",
+    "://dx.doi.org/",
+    "zenodo.org/",
+    "figshare.com/",
+    "://archive.org/",
+    ".archive.org/",
+    "://127.0.0.1/",
+    "://www.kb.dk/",
+    "://kb-images.kb.dk/",
+    "://mdz-nbn-resolving.de/",
+    "://aggr.ukm.um.si/",
+    "://edoc.mpg.de/",
+    "doaj.org/",
+    "orcid.org/",
+    "://gateway.isiknowledge.com/",
+    # OAI specific additions
+    "://hdl.handle.net/",
+]
+
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+    "oai:kb.dk:",
+    "oai:bdr.oai.bsb-muenchen.de:",
+    "oai:hispana.mcu.es:",
+    "oai:bnf.fr:",
+    "oai:ukm.si:",
+    "oai:biodiversitylibrary.org:",
+    "oai:hsp.org:",
+    "oai:repec:",
+    "oai:n/a:",
+    "oai:quod.lib.umich.edu:",
+    "oai:americanae.aecid.es:",
+    "oai:www.irgrid.ac.cn:",
+    "oai:espace.library.uq.edu:",
+    "oai:edoc.mpg.de:",
+    "oai:bibliotecadigital.jcyl.es:",
+    "oai:repository.erciyes.edu.tr:",
+    "oai:krm.or.kr:",
+    "oai:hypotheses.org:%",
+]
+
+RELEASE_STAGE_MAP = {
+    "info:eu-repo/semantics/draftVersion": "draft",
+    "info:eu-repo/semantics/submittedVersion": "submitted",
+    "info:eu-repo/semantics/acceptedVersion": "accepted",
+    "info:eu-repo/semantics/publishedVersion": "published",
+    "info:eu-repo/semantics/updatedVersion": "updated",
+}
+
+
+def canon(s):
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj):
+    """
+    Transforms from a single OAI-PMH object to zero or more ingest requests.
+    Returns a list of dicts.
+    """
+
+    requests = []
+    if not obj.get("oai") or not obj["oai"].startswith("oai:"):
+        return []
+    if not obj.get("urls"):
+        return []
+
+    oai_id = obj["oai"].lower()
+    for prefix in OAI_BLOCKLIST:
+        if oai_id.startswith(prefix):
+            return []
+
+    # look in obj['formats'] for PDF?
+    if obj.get("formats"):
+        # if there is a list of formats, and it does not contain PDF, then
+        # skip. Note that we will continue if there is no formats list.
+        has_pdf = False
+        for f in obj["formats"]:
+            if "pdf" in f.lower():
+                has_pdf = True
+        if not has_pdf:
+            return []
+
+    doi = None
+    if obj.get("doi"):
+        doi = obj["doi"][0].lower().strip()
+        if not doi.startswith("10."):
+            doi = None
+
+    # infer release stage and/or type from obj['types']
+    release_stage = None
+    for t in obj.get("types", []):
+        if t in RELEASE_STAGE_MAP:
+            release_stage = RELEASE_STAGE_MAP[t]
+
+    # TODO: infer rel somehow? Eg, repository vs. OJS publisher
+    rel = None
+
+    for url in obj["urls"]:
+        skip = False
+        for domain in DOMAIN_BLOCKLIST:
+            if domain in url:
+                skip = True
+        if skip:
+            continue
+        try:
+            base_url = canon(url)
+        except UnicodeEncodeError:
+            continue
+
+        request = {
+            "base_url": base_url,
+            "ingest_type": "pdf",
+            "link_source": "oai",
+            "link_source_id": oai_id,
+            "ingest_request_source": "metha-bulk",
+            "release_stage": release_stage,
+            "rel": rel,
+            "ext_ids": {
+                "oai": obj["oai"].lower(),
+            },
+            "edit_extra": {},
+        }
+        if doi:
+            request["ext_ids"]["doi"] = doi
+        requests.append(request)
+
+    return requests
+
+
+def run(args):
+    for l in args.json_file:
+        if not l.strip():
+            continue
+        row = json.loads(l)
+
+        requests = transform(row) or []
+        for r in requests:
+            print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file",
+        help="OAI-PMH dump file to use (usually stdin)",
+        type=argparse.FileType("r"),
+    )
+    subparsers = parser.add_subparsers()
+
+    args = parser.parse_args()
+
+    run(args)
+
+
+if __name__ == "__main__":
+    main()