aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/covid2ingestrequest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/covid2ingestrequest.py')
-rwxr-xr-xpython/scripts/covid2ingestrequest.py90
1 files changed, 90 insertions, 0 deletions
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
new file mode 100755
index 0000000..e3bf4f0
--- /dev/null
+++ b/python/scripts/covid2ingestrequest.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+"""
+Transform an unpaywall dump (JSON) into ingest requests.
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform_cnki(obj):
+
+ requests = []
+ assert obj["cnki_id"]
+
+ requests = []
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+
+ return requests
+
+
+def transform_wanfang(obj):
+
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ if "wanfang_id" in row:
+ requests = transform_wanfang(row) or []
+ elif "cnki_id" in row:
+ requests = transform_cnki(row) or []
+ else:
+ continue
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main():
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()