diff options
Diffstat (limited to 'python/scripts/oai2ingestrequest.py')
-rwxr-xr-x | python/scripts/oai2ingestrequest.py | 177 |
1 files changed, 177 insertions, 0 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py new file mode 100755 index 0000000..97c38f9 --- /dev/null +++ b/python/scripts/oai2ingestrequest.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +Transform an OAI-PMH bulk dump (JSON) into ingest requests. + +Eg: https://archive.org/details/oai_harvest_20200215 +""" + +import argparse +import json +import sys + +import urlcanon + +DOMAIN_BLOCKLIST = [ + # large OA publishers (we get via DOI) + # large repos and aggregators (we crawl directly) + "://arxiv.org/", + "://europepmc.org/", + "ncbi.nlm.nih.gov/", + "semanticscholar.org/", + "://doi.org/", + "://dx.doi.org/", + "zenodo.org/", + "figshare.com/", + "://archive.org/", + ".archive.org/", + "://127.0.0.1/", + "://www.kb.dk/", + "://kb-images.kb.dk/", + "://mdz-nbn-resolving.de/", + "://aggr.ukm.um.si/", + "://edoc.mpg.de/", + "doaj.org/", + "orcid.org/", + "://gateway.isiknowledge.com/", + # OAI specific additions + "://hdl.handle.net/", +] + +# OAI identifier prefixes for repositories that we want to skip (for various reasons) +OAI_BLOCKLIST = [ + "oai:kb.dk:", + "oai:bdr.oai.bsb-muenchen.de:", + "oai:hispana.mcu.es:", + "oai:bnf.fr:", + "oai:ukm.si:", + "oai:biodiversitylibrary.org:", + "oai:hsp.org:", + "oai:repec:", + "oai:n/a:", + "oai:quod.lib.umich.edu:", + "oai:americanae.aecid.es:", + "oai:www.irgrid.ac.cn:", + "oai:espace.library.uq.edu:", + "oai:edoc.mpg.de:", + "oai:bibliotecadigital.jcyl.es:", + "oai:repository.erciyes.edu.tr:", + "oai:krm.or.kr:", + "oai:hypotheses.org:%", +] + +RELEASE_STAGE_MAP = { + "info:eu-repo/semantics/draftVersion": "draft", + "info:eu-repo/semantics/submittedVersion": "submitted", + "info:eu-repo/semantics/acceptedVersion": "accepted", + "info:eu-repo/semantics/publishedVersion": "published", + "info:eu-repo/semantics/updatedVersion": "updated", +} + + +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) + + +def transform(obj): + """ + Transforms from a single OAI-PMH object to zero or more ingest requests. + Returns a list of dicts. + """ + + requests = [] + if not obj.get("oai") or not obj["oai"].startswith("oai:"): + return [] + if not obj.get("urls"): + return [] + + oai_id = obj["oai"].lower() + for prefix in OAI_BLOCKLIST: + if oai_id.startswith(prefix): + return [] + + # look in obj['formats'] for PDF? + if obj.get("formats"): + # if there is a list of formats, and it does not contain PDF, then + # skip. Note that we will continue if there is no formats list. + has_pdf = False + for f in obj["formats"]: + if "pdf" in f.lower(): + has_pdf = True + if not has_pdf: + return [] + + doi = None + if obj.get("doi"): + doi = obj["doi"][0].lower().strip() + if not doi.startswith("10."): + doi = None + + # infer release stage and/or type from obj['types'] + release_stage = None + for t in obj.get("types", []): + if t in RELEASE_STAGE_MAP: + release_stage = RELEASE_STAGE_MAP[t] + + # TODO: infer rel somehow? Eg, repository vs. OJS publisher + rel = None + + for url in obj["urls"]: + skip = False + for domain in DOMAIN_BLOCKLIST: + if domain in url: + skip = True + if skip: + continue + try: + base_url = canon(url) + except UnicodeEncodeError: + continue + + request = { + "base_url": base_url, + "ingest_type": "pdf", + "link_source": "oai", + "link_source_id": oai_id, + "ingest_request_source": "metha-bulk", + "release_stage": release_stage, + "rel": rel, + "ext_ids": { + "oai": obj["oai"].lower(), + }, + "edit_extra": {}, + } + if doi: + request["ext_ids"]["doi"] = doi + requests.append(request) + + return requests + + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + + requests = transform(row) or [] + for r in requests: + print("{}".format(json.dumps(r, sort_keys=True))) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", + help="OAI-PMH dump file to use (usually stdin)", + type=argparse.FileType("r"), + ) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + + +if __name__ == "__main__": + main() |