Diffstat (limited to 'python/scripts/oai2ingestrequest.py')
-rwxr-xr-x  python/scripts/oai2ingestrequest.py  112
1 file changed, 76 insertions(+), 36 deletions(-)
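For orientation, the harvest dump records this script consumes are JSON objects carrying at least "oai" and "urls" fields, plus optional "formats", "doi", and "types" fields that transform() (below) inspects. A minimal sketch of such a record, with hypothetical values (the field names match what the script reads; the values are made up):

# Illustrative input record only; values are hypothetical, field names are
# the ones transform() reads from the OAI-PMH bulk dump.
record = {
    "oai": "oai:example.edu:12345",
    "urls": ["https://example.edu/bitstream/12345/paper.pdf"],
    "formats": ["application/pdf"],
    "doi": ["10.1234/example.12345"],
    "types": ["info:eu-repo/semantics/publishedVersion"],
}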
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,19 +1,18 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
+ "://www.kb.dk/",
+ "://kb-images.kb.dk/",
+ "://mdz-nbn-resolving.de/",
+ "://aggr.ukm.um.si/",
+ "://edoc.mpg.de/",
+ "doaj.org/",
+ "orcid.org/",
+ "://gateway.isiknowledge.com/",
# OAI specific additions
"://hdl.handle.net/",
]
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+ "oai:kb.dk:",
+ "oai:bdr.oai.bsb-muenchen.de:",
+ "oai:hispana.mcu.es:",
+ "oai:bnf.fr:",
+ "oai:ukm.si:",
+ "oai:biodiversitylibrary.org:",
+ "oai:hsp.org:",
+ "oai:repec:",
+ "oai:n/a:",
+ "oai:quod.lib.umich.edu:",
+ "oai:americanae.aecid.es:",
+ "oai:www.irgrid.ac.cn:",
+ "oai:espace.library.uq.edu:",
+ "oai:edoc.mpg.de:",
+ "oai:bibliotecadigital.jcyl.es:",
+ "oai:repository.erciyes.edu.tr:",
+ "oai:krm.or.kr:",
+    "oai:hypotheses.org:",
+]
+
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -50,38 +80,43 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
+ oai_id = obj["oai"].lower()
+ for prefix in OAI_BLOCKLIST:
+ if oai_id.startswith(prefix):
+ return []
+
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -94,23 +129,25 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": oai_id,
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
+ if doi:
+ request["ext_ids"]["doi"] = doi
requests.append(request)
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -121,17 +158,20 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file",
help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
main()
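With the patch applied, transform() emits one ingest request per non-blocklisted URL, and the DOI now only appears in ext_ids when one was actually found. A hedged sketch of the output for the hypothetical record shown above (base_url is the urlcanon/whatwg-canonicalized form of the input URL):

# Approximate result of transform(record) after this patch; values follow the
# hypothetical record above, not any real harvest entry.
# {
#     "base_url": "https://example.edu/bitstream/12345/paper.pdf",
#     "ingest_type": "pdf",
#     "link_source": "oai",
#     "link_source_id": "oai:example.edu:12345",
#     "ingest_request_source": "metha-bulk",
#     "release_stage": "published",
#     "rel": None,
#     "ext_ids": {"oai": "oai:example.edu:12345", "doi": "10.1234/example.12345"},
#     "edit_extra": {},
# }
#
# Typical invocation (the positional argument is the dump file, usually stdin,
# so "-" also works):
#   ./oai2ingestrequest.py oai_harvest.json > ingest_requests.json
#   zcat oai_harvest.json.gz | ./oai2ingestrequest.py - > ingest_requests.json

run() prints each request as a single sort_keys JSON object per line, one line per emitted request.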