#!/usr/bin/env python3
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.

Eg: https://archive.org/details/oai_harvest_20200215
"""

import argparse
import json
import sys

import urlcanon

# Substrings of URLs we never want to ingest from this source:
# - large OA publishers (we get via DOI)
# - large repos and aggregators (we crawl directly)
DOMAIN_BLOCKLIST = [
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    "semanticscholar.org/",
    "://doi.org/",
    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
    "://127.0.0.1/",
    # OAI specific additions
    "://hdl.handle.net/",
]

# Map from info:eu-repo version URIs to fatcat-style release stages.
RELEASE_STAGE_MAP = {
    "info:eu-repo/semantics/draftVersion": "draft",
    "info:eu-repo/semantics/submittedVersion": "submitted",
    "info:eu-repo/semantics/acceptedVersion": "accepted",
    "info:eu-repo/semantics/publishedVersion": "published",
    "info:eu-repo/semantics/updatedVersion": "updated",
}


def canon(s):
    """Return the WHATWG-canonicalized form of URL string `s`."""
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))


def transform(obj):
    """
    Transforms from a single OAI-PMH object to zero or more ingest requests.

    Returns a list of dicts (one ingest request per usable URL); returns an
    empty list when the record has no OAI identifier, no URLs, or a formats
    list that does not mention PDF.
    """
    requests = []
    if not obj.get("oai") or not obj["oai"].startswith("oai:"):
        return []
    if not obj.get("urls"):
        return []

    # If there is a list of formats, and it does not contain PDF, then skip.
    # Note that we will continue if there is no formats list at all.
    if obj.get("formats"):
        if not any("pdf" in f.lower() for f in obj["formats"]):
            return []

    # Only keep the first DOI, and only if it looks like a real DOI.
    doi = None
    if obj.get("doi"):
        doi = obj["doi"][0].lower().strip()
        if not doi.startswith("10."):
            doi = None

    # Infer release stage from obj['types'] (last matching type wins, as in
    # the original iteration order).
    release_stage = None
    for t in obj.get("types", []):
        if t in RELEASE_STAGE_MAP:
            release_stage = RELEASE_STAGE_MAP[t]

    # TODO: infer rel somehow? Eg, repository vs. OJS publisher
    rel = None

    for url in obj["urls"]:
        # Skip URLs matching any blocklisted domain substring.
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(url)
        except UnicodeEncodeError:
            # Some harvested URLs contain un-encodable characters; skip them.
            continue

        request = {
            "base_url": base_url,
            "ingest_type": "pdf",
            "link_source": "oai",
            "link_source_id": obj["oai"].lower(),
            "ingest_request_source": "metha-bulk",
            "release_stage": release_stage,
            "rel": rel,
            "ext_ids": {
                "doi": doi,
                "oai": obj["oai"].lower(),
            },
            "edit_extra": {},
        }
        requests.append(request)

    return requests


def run(args):
    """Read newline-delimited JSON records from args.json_file and print one
    ingest request (JSON, sorted keys) per line for each usable URL."""
    for line in args.json_file:
        if not line.strip():
            continue
        row = json.loads(line)
        for req in transform(row):
            print(json.dumps(req, sort_keys=True))


def main():
    """Parse command-line arguments and run the transform pipeline."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file",
        help="OAI-PMH dump file to use (usually stdin)",
        type=argparse.FileType("r"),
    )
    # NOTE: removed an unused `parser.add_subparsers()` call that registered
    # no subcommands and served no purpose.
    args = parser.parse_args()
    run(args)


if __name__ == "__main__":
    main()