diff options
Diffstat (limited to 'python/scripts/unpaywall2ingestrequest.py')
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 111 |
1 files changed, 111 insertions, 0 deletions
#!/usr/bin/env python3
# Reconstructed from the diff that adds
# python/scripts/unpaywall2ingestrequest.py (new file, mode 100755).

"""
Transform an unpaywall dump (JSON) into ingest requests.

Reads the dump file given on the command line one line at a time, parses each
non-blank line as a JSON object, and prints every resulting ingest request to
stdout as one JSON object per line.
"""

import argparse
import json
import sys

import urlcanon

# Substrings matched against a location's PDF URL; any hit drops the location.
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)

    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    "semanticscholar.org/",
    "://doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
]

# Maps the unpaywall location 'version' value to an ingest 'release_stage'.
RELEASE_STAGE_MAP = {
    'draftVersion': 'draft',
    'submittedVersion': 'submitted',
    'acceptedVersion': 'accepted',
    'publishedVersion': 'published',
    'updatedVersion': 'updated',
}


def canon(s):
    """
    Return the WHATWG-canonicalized form of URL `s` as a string.
    """
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))


def transform(obj):
    """
    Transforms from a single unpaywall object to zero or more ingest requests.

    A request is emitted for every entry of obj['oa_locations'] that has a
    'url_for_pdf' which does not match DOMAIN_BLOCKLIST and which can be
    canonicalized. Objects whose 'doi' is missing, empty, or does not start
    with '10.' produce no requests.

    Returns a list of dicts.
    """
    requests = []

    doi = obj.get('doi')
    if not doi or not doi.startswith('10.'):
        return requests
    doi = doi.lower()

    # 'oa_locations' may be absent or null; treat both as "no locations".
    for location in obj.get('oa_locations') or []:
        url = location.get('url_for_pdf')
        if not url:
            continue
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(url)
        except UnicodeEncodeError:
            # Best-effort: drop URLs that fail canonicalization this way.
            continue

        request = {
            'base_url': base_url,
            'ingest_type': 'pdf',
            'link_source': 'unpaywall',
            'link_source_id': doi,
            'ingest_request_source': 'unpaywall',
            # Unknown or missing versions map to None.
            'release_stage': RELEASE_STAGE_MAP.get(location.get('version')),
            'rel': location.get('host_type'),
            'ext_ids': {
                'doi': doi,
            },
            'edit_extra': {},
        }
        if obj.get('oa_status'):
            request['edit_extra']['oa_status'] = obj['oa_status']
        if location.get('evidence'):
            request['edit_extra']['evidence'] = location['evidence']
        if location.get('pmh_id'):
            request['ext_ids']['pmh_id'] = location['pmh_id']
        requests.append(request)

    return requests


def run(args):
    """
    Convert every non-blank line of args.json_file and print the requests.

    Each line is parsed as one JSON object; each resulting request is written
    to stdout as a single JSON line with sorted keys.
    """
    for line in args.json_file:
        if not line.strip():
            continue
        row = json.loads(line)
        for request in transform(row):
            print(json.dumps(request, sort_keys=True))


def main():
    """
    Parse the command line (one positional dump file) and run the transform.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
                        help="unpaywall dump file to use",
                        type=argparse.FileType('r'))

    args = parser.parse_args()

    run(args)


if __name__ == '__main__':
    main()