#!/usr/bin/env python3 """ Transform an unpaywall dump (JSON) into ingest requests. """ import sys import json import argparse import urlcanon def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) def transform_cnki(obj): requests = [] assert obj['cnki_id'] requests = [] requests.append({ 'base_url': canon(obj['info_url']), 'ingest_type': 'pdf', 'link_source': 'cnki_covid19', 'link_source_id': obj['cnki_id'], 'ingest_request_source': 'scrape-covid19', }) if 'read_url' in obj: requests.append({ 'base_url': canon(obj['read_url']), 'ingest_type': 'pdf', # actually HTML 'link_source': 'cnki_covid19', 'link_source_id': obj['cnki_id'], 'ingest_request_source': 'scrape-covid19', }) return requests def transform_wanfang(obj): assert obj['wanfang_id'] return [{ 'base_url': canon(obj['url']), 'ingest_type': 'pdf', 'link_source': 'wanfang_covid19', 'link_source_id': obj['wanfang_id'], 'ingest_request_source': 'scrape-covid19', }] def run(args): for l in args.json_file: if not l.strip(): continue row = json.loads(l) if 'wanfang_id' in row: requests = transform_wanfang(row) or [] elif 'cnki_id' in row: requests = transform_cnki(row) or [] else: continue for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', help="COVID-19 metadata file to use", type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) if __name__ == '__main__': main()