aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/oai2ingestrequest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/oai2ingestrequest.py')
-rwxr-xr-xpython/scripts/oai2ingestrequest.py177
1 files changed, 177 insertions, 0 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
new file mode 100755
index 0000000..97c38f9
--- /dev/null
+++ b/python/scripts/oai2ingestrequest.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+Transform an OAI-PMH bulk dump (JSON) into ingest requests.
+
+Eg: https://archive.org/details/oai_harvest_20200215
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
# URL substrings to skip: transform() drops any URL that contains one of these
# anywhere in the string (plain substring match, not a parsed-hostname match).
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)
    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    "semanticscholar.org/",
    "://doi.org/",
    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
    "://127.0.0.1/",
    "://www.kb.dk/",
    "://kb-images.kb.dk/",
    "://mdz-nbn-resolving.de/",
    "://aggr.ukm.um.si/",
    "://edoc.mpg.de/",
    "doaj.org/",
    "orcid.org/",
    "://gateway.isiknowledge.com/",
    # OAI specific additions
    "://hdl.handle.net/",
]

# OAI identifier prefixes for repositories that we want to skip (for various reasons).
# These are compared with str.startswith() against the lower-cased identifier,
# so every entry must be a literal, lower-case prefix (no SQL-style wildcards).
OAI_BLOCKLIST = [
    "oai:kb.dk:",
    "oai:bdr.oai.bsb-muenchen.de:",
    "oai:hispana.mcu.es:",
    "oai:bnf.fr:",
    "oai:ukm.si:",
    "oai:biodiversitylibrary.org:",
    "oai:hsp.org:",
    "oai:repec:",
    "oai:n/a:",
    "oai:quod.lib.umich.edu:",
    "oai:americanae.aecid.es:",
    "oai:www.irgrid.ac.cn:",
    "oai:espace.library.uq.edu:",
    "oai:edoc.mpg.de:",
    "oai:bibliotecadigital.jcyl.es:",
    "oai:repository.erciyes.edu.tr:",
    "oai:krm.or.kr:",
    # previously carried a trailing "%" (LIKE wildcard), which never matched
    "oai:hypotheses.org:",
]

# Maps "info:eu-repo/semantics/*Version" type strings to release stage names.
RELEASE_STAGE_MAP = {
    "info:eu-repo/semantics/draftVersion": "draft",
    "info:eu-repo/semantics/submittedVersion": "submitted",
    "info:eu-repo/semantics/acceptedVersion": "accepted",
    "info:eu-repo/semantics/publishedVersion": "published",
    "info:eu-repo/semantics/updatedVersion": "updated",
}
+
+
def canon(s):
    """
    Return the WHATWG-canonicalized form of the URL `s`, as a string.
    """
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
+
+
def transform(obj):
    """
    Transforms from a single OAI-PMH object to zero or more ingest requests.
    Returns a list of dicts.

    An empty list is returned when the record has no "oai" identifier starting
    with "oai:", has no "urls", has an identifier matching OAI_BLOCKLIST, or
    has a non-empty "formats" list in which no entry mentions PDF. Individual
    URLs are skipped when they contain a DOMAIN_BLOCKLIST entry or when
    canonicalization raises UnicodeEncodeError.
    """
    raw_id = obj.get("oai")
    if not raw_id or not raw_id.startswith("oai:"):
        return []
    if not obj.get("urls"):
        return []

    # blocklist prefixes are compared against the lower-cased identifier
    oai_id = raw_id.lower()
    if oai_id.startswith(tuple(OAI_BLOCKLIST)):
        return []

    # A non-empty formats list must mention PDF somewhere; a missing or empty
    # list is not treated as disqualifying.
    formats = obj.get("formats")
    if formats:
        lowered = [fmt.lower() for fmt in formats]
        if not any("pdf" in fmt for fmt in lowered):
            return []

    # only the first DOI is considered, and only if it looks like a DOI
    doi = None
    if obj.get("doi"):
        candidate = obj["doi"][0].lower().strip()
        if candidate.startswith("10."):
            doi = candidate

    # infer release stage from obj['types']; the last recognized type wins
    release_stage = None
    for kind in obj.get("types", []):
        release_stage = RELEASE_STAGE_MAP.get(kind, release_stage)

    # TODO: infer rel somehow? Eg, repository vs. OJS publisher
    rel = None

    ext_ids = {"oai": oai_id}
    if doi:
        ext_ids["doi"] = doi

    requests = []
    for url in obj["urls"]:
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(url)
        except UnicodeEncodeError:
            continue

        requests.append(
            {
                "base_url": base_url,
                "ingest_type": "pdf",
                "link_source": "oai",
                "link_source_id": oai_id,
                "ingest_request_source": "metha-bulk",
                "release_stage": release_stage,
                "rel": rel,
                # fresh dicts per request so callers can mutate them independently
                "ext_ids": dict(ext_ids),
                "edit_extra": {},
            }
        )

    return requests
+
+
def run(args):
    """
    Read JSON records line-by-line from `args.json_file`, transform each one,
    and print every resulting ingest request to stdout as one JSON line
    (keys sorted). Lines that are empty or whitespace-only are skipped.
    """
    for line in args.json_file:
        if not line.strip():
            continue
        record = json.loads(line)
        for request in transform(record) or []:
            print(json.dumps(request, sort_keys=True))
+
+
def main():
    """
    Command-line entry point: parse arguments and run the conversion.

    Takes a single positional argument, `json_file`, which argparse opens for
    reading in text mode.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file",
        help="OAI-PMH dump file to use (usually stdin)",
        type=argparse.FileType("r"),
    )
    # NOTE: an unused, empty sub-command group used to be registered here; it
    # only added a stray "{}" positional to the usage text, so it was removed.

    args = parser.parse_args()

    run(args)


if __name__ == "__main__":
    main()