diff options
Diffstat (limited to 'python/ia_pdf_match.py')
-rwxr-xr-x | python/ia_pdf_match.py | 97 |
1 files changed, 51 insertions, 46 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py index bc814de..493c9e7 100755 --- a/python/ia_pdf_match.py +++ b/python/ia_pdf_match.py @@ -1,8 +1,7 @@ #!/usr/bin/env python3 - """ Input is IA item metadata JSON. -Ouput is insertable fatcat "match" JSON +Output is insertable fatcat "match" JSON - md5 - sha1 @@ -22,87 +21,93 @@ When invoking import matched, be sure to: --default-mimetype application/pdf """ -import sys import json +import sys +from typing import Any, Dict, Optional -def parse(obj): - if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'): - sys.stderr.write('skip: test item\n') + +def parse(obj: dict) -> Optional[Dict[str, Any]]: + if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"): + print("skip: test item", file=sys.stderr) return None extid_type = None extid = None - if obj['metadata']['identifier'].startswith('arxiv-'): - extid_type = 'arxiv' - extid = obj['metadata'].get('source') + if obj["metadata"]["identifier"].startswith("arxiv-"): + extid_type = "arxiv" + extid = obj["metadata"].get("source") if not extid: - sys.stderr.write('skip: no source\n') + print("skip: no source", file=sys.stderr) return None - assert extid.startswith('http://arxiv.org/abs/') - extid = extid.replace('http://arxiv.org/abs/', '') - #print(extid) - assert '/' in extid or '.' in extid - if not 'v' in extid or not extid[-1].isdigit(): - sys.stderr.write('skip: non-versioned arxiv_id\n') + assert extid.startswith("http://arxiv.org/abs/") + extid = extid.replace("http://arxiv.org/abs/", "") + # print(extid) + assert "/" in extid or "." in extid + if "v" not in extid or not extid[-1].isdigit(): + print("skip: non-versioned arxiv_id", file=sys.stderr) return None - elif obj['metadata']['identifier'].startswith('paper-doi-10_'): - extid_type = 'doi' - extid = obj['metadata']['identifier-doi'] + elif obj["metadata"]["identifier"].startswith("paper-doi-10_"): + extid_type = "doi" + extid = obj["metadata"]["identifier-doi"] assert extid.startswith("10.") - elif obj['metadata']['identifier'].startswith('pubmed-PMC'): - extid_type = 'pmcid' - extid = obj['metadata']['identifier'].replace('pubmed-', '') + elif obj["metadata"]["identifier"].startswith("pubmed-PMC"): + extid_type = "pmcid" + extid = obj["metadata"]["identifier"].replace("pubmed-", "") assert extid.startswith("PMC") int(extid[3:]) - elif obj['metadata']['identifier'].startswith('jstor-'): - extid_type = 'jstor' - extid = obj['metadata']['identifier'].replace('jstor-', '') + elif obj["metadata"]["identifier"].startswith("jstor-"): + extid_type = "jstor" + extid = obj["metadata"]["identifier"].replace("jstor-", "") int(extid) else: raise NotImplementedError() pdf_file = None - for f in obj['files']: - if f['source'] == "original" and "PDF" in f['format']: + for f in obj["files"]: + if f["source"] == "original" and "PDF" in f["format"]: pdf_file = f break if not pdf_file: - sys.stderr.write('skip: no PDF found: {}\n'.format(obj['metadata']['identifier'])) - #for f in obj['files']: - # sys.stderr.write(f['format'] + "\n") + print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr) + # for f in obj['files']: + # print(f['format'], file=sys.stderr) return None - assert pdf_file['name'].endswith('.pdf') + assert pdf_file["name"].endswith(".pdf") match = { - 'md5': pdf_file['md5'], - 'sha1': pdf_file['sha1'], - 'size': int(pdf_file['size']), - 'mimetype': 'application/pdf', - 'urls': [ + "md5": pdf_file["md5"], + "sha1": pdf_file["sha1"], + "size": int(pdf_file["size"]), + "mimetype": "application/pdf", + "urls": [ "https://archive.org/download/{}/{}".format( - obj['metadata']['identifier'], - pdf_file['name']), + obj["metadata"]["identifier"], pdf_file["name"] + ), ], - 'cdx': [], - 'dois': [], + "cdx": [], + "dois": [], } - if extid_type == 'doi': - match['dois'] = [extid,] + if extid_type == "doi": + match["dois"] = [ + extid, + ] else: match[extid_type] = extid return match -def run(): + +def run() -> None: for line in sys.stdin: if not line: continue obj = json.loads(line) match = parse(obj) - if match: - print(json.dumps(match)) + if match is not None: + print(json.dumps(match, sort_keys=True)) + -if __name__ == '__main__': +if __name__ == "__main__": run() |