diff options
Diffstat (limited to 'python/scripts/oai2ingestrequest.py')
-rwxr-xr-x | python/scripts/oai2ingestrequest.py | 112 |
1 files changed, 76 insertions, 36 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 916f41c..97c38f9 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -1,19 +1,18 @@ #!/usr/bin/env python3 - """ Transform an OAI-PMH bulk dump (JSON) into ingest requests. Eg: https://archive.org/details/oai_harvest_20200215 """ -import sys -import json import argparse +import json +import sys + import urlcanon DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", @@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [ "://archive.org/", ".archive.org/", "://127.0.0.1/", - + "://www.kb.dk/", + "://kb-images.kb.dk/", + "://mdz-nbn-resolving.de/", + "://aggr.ukm.um.si/", + "://edoc.mpg.de/", + "doaj.org/", + "orcid.org/", + "://gateway.isiknowledge.com/", # OAI specific additions "://hdl.handle.net/", ] +# OAI identifier prefixes for repositories that we want to skip (for various reasons) +OAI_BLOCKLIST = [ + "oai:kb.dk:", + "oai:bdr.oai.bsb-muenchen.de:", + "oai:hispana.mcu.es:", + "oai:bnf.fr:", + "oai:ukm.si:", + "oai:biodiversitylibrary.org:", + "oai:hsp.org:", + "oai:repec:", + "oai:n/a:", + "oai:quod.lib.umich.edu:", + "oai:americanae.aecid.es:", + "oai:www.irgrid.ac.cn:", + "oai:espace.library.uq.edu:", + "oai:edoc.mpg.de:", + "oai:bibliotecadigital.jcyl.es:", + "oai:repository.erciyes.edu.tr:", + "oai:krm.or.kr:", + "oai:hypotheses.org:%", +] + RELEASE_STAGE_MAP = { - 'info:eu-repo/semantics/draftVersion': 'draft', - 'info:eu-repo/semantics/submittedVersion': 'submitted', - 'info:eu-repo/semantics/acceptedVersion': 'accepted', - 'info:eu-repo/semantics/publishedVersion': 'published', - 'info:eu-repo/semantics/updatedVersion': 'updated', + "info:eu-repo/semantics/draftVersion": "draft", + "info:eu-repo/semantics/submittedVersion": "submitted", + "info:eu-repo/semantics/acceptedVersion": "accepted", + "info:eu-repo/semantics/publishedVersion": "published", + "info:eu-repo/semantics/updatedVersion": "updated", } + def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj): """ Transforms from a single OAI-PMH object to zero or more ingest requests. @@ -50,38 +80,43 @@ def transform(obj): """ requests = [] - if not obj.get('oai') or not obj['oai'].startswith('oai:'): + if not obj.get("oai") or not obj["oai"].startswith("oai:"): return [] - if not obj.get('urls'): + if not obj.get("urls"): return [] + oai_id = obj["oai"].lower() + for prefix in OAI_BLOCKLIST: + if oai_id.startswith(prefix): + return [] + # look in obj['formats'] for PDF? - if obj.get('formats'): + if obj.get("formats"): # if there is a list of formats, and it does not contain PDF, then # skip. Note that we will continue if there is no formats list. has_pdf = False - for f in obj['formats']: - if 'pdf' in f.lower(): + for f in obj["formats"]: + if "pdf" in f.lower(): has_pdf = True if not has_pdf: return [] doi = None - if obj.get('doi'): - doi = obj['doi'][0].lower().strip() - if not doi.startswith('10.'): + if obj.get("doi"): + doi = obj["doi"][0].lower().strip() + if not doi.startswith("10."): doi = None # infer release stage and/or type from obj['types'] release_stage = None - for t in obj.get('types', []): + for t in obj.get("types", []): if t in RELEASE_STAGE_MAP: release_stage = RELEASE_STAGE_MAP[t] # TODO: infer rel somehow? Eg, repository vs. OJS publisher rel = None - for url in obj['urls']: + for url in obj["urls"]: skip = False for domain in DOMAIN_BLOCKLIST: if domain in url: @@ -94,23 +129,25 @@ def transform(obj): continue request = { - 'base_url': base_url, - 'ingest_type': 'pdf', - 'link_source': 'oai', - 'link_source_id': obj['oai'].lower(), - 'ingest_request_source': 'metha-bulk', - 'release_stage': release_stage, - 'rel': rel, - 'ext_ids': { - 'doi': doi, - 'oai': obj['oai'].lower(), + "base_url": base_url, + "ingest_type": "pdf", + "link_source": "oai", + "link_source_id": oai_id, + "ingest_request_source": "metha-bulk", + "release_stage": release_stage, + "rel": rel, + "ext_ids": { + "oai": obj["oai"].lower(), }, - 'edit_extra': {}, + "edit_extra": {}, } + if doi: + request["ext_ids"]["doi"] = doi requests.append(request) return requests + def run(args): for l in args.json_file: if not l.strip(): @@ -121,17 +158,20 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="OAI-PMH dump file to use (usually stdin)", - type=argparse.FileType('r')) + type=argparse.FileType("r"), + ) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() |