diff options
Diffstat (limited to 'python/scripts/covid2ingestrequest.py')
-rwxr-xr-x | python/scripts/covid2ingestrequest.py | 79 |
1 file changed, 43 insertions, 36 deletions
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..e3bf4f0 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. """ -import sys -import json import argparse +import json +import sys + import urlcanon @@ -18,38 +18,44 @@ def canon(s): def transform_cnki(obj): requests = [] - assert obj['cnki_id'] - + assert obj["cnki_id"] requests = [] - requests.append({ - 'base_url': canon(obj['info_url']), - 'ingest_type': 'pdf', - 'link_source': 'cnki_covid19', - 'link_source_id': obj['cnki_id'], - 'ingest_request_source': 'scrape-covid19', - }) - if 'read_url' in obj: - requests.append({ - 'base_url': canon(obj['read_url']), - 'ingest_type': 'pdf', # actually HTML - 'link_source': 'cnki_covid19', - 'link_source_id': obj['cnki_id'], - 'ingest_request_source': 'scrape-covid19', - }) + requests.append( + { + "base_url": canon(obj["info_url"]), + "ingest_type": "pdf", + "link_source": "cnki_covid19", + "link_source_id": obj["cnki_id"], + "ingest_request_source": "scrape-covid19", + } + ) + if "read_url" in obj: + requests.append( + { + "base_url": canon(obj["read_url"]), + "ingest_type": "pdf", # actually HTML + "link_source": "cnki_covid19", + "link_source_id": obj["cnki_id"], + "ingest_request_source": "scrape-covid19", + } + ) return requests + def transform_wanfang(obj): - assert obj['wanfang_id'] - return [{ - 'base_url': canon(obj['url']), - 'ingest_type': 'pdf', - 'link_source': 'wanfang_covid19', - 'link_source_id': obj['wanfang_id'], - 'ingest_request_source': 'scrape-covid19', - }] + assert obj["wanfang_id"] + return [ + { + "base_url": canon(obj["url"]), + "ingest_type": "pdf", + "link_source": "wanfang_covid19", + "link_source_id": obj["wanfang_id"], + "ingest_request_source": "scrape-covid19", + } + ] def run(args): @@ -58,26 +64,27 @@ def run(args): continue row = 
json.loads(l) - if 'wanfang_id' in row: + if "wanfang_id" in row: requests = transform_wanfang(row) or [] - elif 'cnki_id' in row: + elif "cnki_id" in row: requests = transform_cnki(row) or [] else: continue for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="COVID-19 metadata file to use", - type=argparse.FileType('r')) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() |