aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-05 18:57:51 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-05 18:57:51 -0700
commitd60a8d6b2380a5d6599203787da59c57a8664322 (patch)
treebe2c992976ab6ef1f6fba6f3a42e902846f26e2a /python/scripts
parent285a8ad33285ece138b4cd420cbf7c854b712f43 (diff)
downloadsandcrawler-d60a8d6b2380a5d6599203787da59c57a8664322.tar.gz
sandcrawler-d60a8d6b2380a5d6599203787da59c57a8664322.zip
first iteration of oai2ingestrequest script
Diffstat (limited to 'python/scripts')
-rwxr-xr-xpython/scripts/oai2ingestrequest.py137
1 files changed, 137 insertions, 0 deletions
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
new file mode 100755
index 0000000..916f41c
--- /dev/null
+++ b/python/scripts/oai2ingestrequest.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+"""
+Transform an OAI-PMH bulk dump (JSON) into ingest requests.
+
+Eg: https://archive.org/details/oai_harvest_20200215
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ "://127.0.0.1/",
+
+ # OAI specific additions
+ "://hdl.handle.net/",
+]
+
+RELEASE_STAGE_MAP = {
+ 'info:eu-repo/semantics/draftVersion': 'draft',
+ 'info:eu-repo/semantics/submittedVersion': 'submitted',
+ 'info:eu-repo/semantics/acceptedVersion': 'accepted',
+ 'info:eu-repo/semantics/publishedVersion': 'published',
+ 'info:eu-repo/semantics/updatedVersion': 'updated',
+}
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+def transform(obj):
+ """
+ Transforms from a single OAI-PMH object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ requests = []
+ if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ return []
+ if not obj.get('urls'):
+ return []
+
+ # look in obj['formats'] for PDF?
+ if obj.get('formats'):
+ # if there is a list of formats, and it does not contain PDF, then
+ # skip. Note that we will continue if there is no formats list.
+ has_pdf = False
+ for f in obj['formats']:
+ if 'pdf' in f.lower():
+ has_pdf = True
+ if not has_pdf:
+ return []
+
+ doi = None
+ if obj.get('doi'):
+ doi = obj['doi'][0].lower().strip()
+ if not doi.startswith('10.'):
+ doi = None
+
+ # infer release stage and/or type from obj['types']
+ release_stage = None
+ for t in obj.get('types', []):
+ if t in RELEASE_STAGE_MAP:
+ release_stage = RELEASE_STAGE_MAP[t]
+
+ # TODO: infer rel somehow? Eg, repository vs. OJS publisher
+ rel = None
+
+ for url in obj['urls']:
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in url:
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(url)
+ except UnicodeEncodeError:
+ continue
+
+ request = {
+ 'base_url': base_url,
+ 'ingest_type': 'pdf',
+ 'link_source': 'oai',
+ 'link_source_id': obj['oai'].lower(),
+ 'ingest_request_source': 'metha-bulk',
+ 'release_stage': release_stage,
+ 'rel': rel,
+ 'ext_ids': {
+ 'doi': doi,
+ 'oai': obj['oai'].lower(),
+ },
+ 'edit_extra': {},
+ }
+ requests.append(request)
+
+ return requests
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('json_file',
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()