#!/usr/bin/env python3 """ Input is IA item metadata JSON. Output is insertable fatcat "match" JSON - md5 - sha1 - sha256 - size - urls - cdx (list; empty here) - dois (list) - pmcid - jstor - arxiv When invoking import matched, be sure to: --default-link-rel repository (?) --default-mimetype application/pdf """ import json import sys from typing import Any, Dict, Optional def parse(obj: dict) -> Optional[Dict[str, Any]]: if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"): print("skip: test item", file=sys.stderr) return None extid_type = None extid = None if obj["metadata"]["identifier"].startswith("arxiv-"): extid_type = "arxiv" extid = obj["metadata"].get("source") if not extid: print("skip: no source", file=sys.stderr) return None assert extid.startswith("http://arxiv.org/abs/") extid = extid.replace("http://arxiv.org/abs/", "") # print(extid) assert "/" in extid or "." in extid if "v" not in extid or not extid[-1].isdigit(): print("skip: non-versioned arxiv_id", file=sys.stderr) return None elif obj["metadata"]["identifier"].startswith("paper-doi-10_"): extid_type = "doi" extid = obj["metadata"]["identifier-doi"] assert extid.startswith("10.") elif obj["metadata"]["identifier"].startswith("pubmed-PMC"): extid_type = "pmcid" extid = obj["metadata"]["identifier"].replace("pubmed-", "") assert extid.startswith("PMC") int(extid[3:]) elif obj["metadata"]["identifier"].startswith("jstor-"): extid_type = "jstor" extid = obj["metadata"]["identifier"].replace("jstor-", "") int(extid) else: raise NotImplementedError() pdf_file = None for f in obj["files"]: if f["source"] == "original" and "PDF" in f["format"]: pdf_file = f break if not pdf_file: print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr) # for f in obj['files']: # print(f['format'], file=sys.stderr) return None assert pdf_file["name"].endswith(".pdf") match = { "md5": pdf_file["md5"], "sha1": pdf_file["sha1"], "size": int(pdf_file["size"]), "mimetype": "application/pdf", "urls": [ "https://archive.org/download/{}/{}".format( obj["metadata"]["identifier"], pdf_file["name"] ), ], "cdx": [], "dois": [], } if extid_type == "doi": match["dois"] = [ extid, ] else: match[extid_type] = extid return match def run() -> None: for line in sys.stdin: if not line: continue obj = json.loads(line) match = parse(obj) if match is not None: print(json.dumps(match, sort_keys=True)) if __name__ == "__main__": run()