diff options
Diffstat (limited to 'python/ia_pdf_match.py')
-rwxr-xr-x | python/ia_pdf_match.py | 77 |
1 files changed, 39 insertions, 38 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py index ac17003..8a95d95 100755 --- a/python/ia_pdf_match.py +++ b/python/ia_pdf_match.py @@ -27,69 +27,70 @@ from typing import Any, Dict, Optional def parse(obj: dict) -> Optional[Dict[str, Any]]: - if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'): - print('skip: test item', file=sys.stderr) + if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"): + print("skip: test item", file=sys.stderr) return None extid_type = None extid = None - if obj['metadata']['identifier'].startswith('arxiv-'): - extid_type = 'arxiv' - extid = obj['metadata'].get('source') + if obj["metadata"]["identifier"].startswith("arxiv-"): + extid_type = "arxiv" + extid = obj["metadata"].get("source") if not extid: - print('skip: no source', file=sys.stderr) + print("skip: no source", file=sys.stderr) return None - assert extid.startswith('http://arxiv.org/abs/') - extid = extid.replace('http://arxiv.org/abs/', '') - #print(extid) - assert '/' in extid or '.' in extid - if 'v' not in extid or not extid[-1].isdigit(): - print('skip: non-versioned arxiv_id', file=sys.stderr) + assert extid.startswith("http://arxiv.org/abs/") + extid = extid.replace("http://arxiv.org/abs/", "") + # print(extid) + assert "/" in extid or "." in extid + if "v" not in extid or not extid[-1].isdigit(): + print("skip: non-versioned arxiv_id", file=sys.stderr) return None - elif obj['metadata']['identifier'].startswith('paper-doi-10_'): - extid_type = 'doi' - extid = obj['metadata']['identifier-doi'] + elif obj["metadata"]["identifier"].startswith("paper-doi-10_"): + extid_type = "doi" + extid = obj["metadata"]["identifier-doi"] assert extid.startswith("10.") - elif obj['metadata']['identifier'].startswith('pubmed-PMC'): - extid_type = 'pmcid' - extid = obj['metadata']['identifier'].replace('pubmed-', '') + elif obj["metadata"]["identifier"].startswith("pubmed-PMC"): + extid_type = "pmcid" + extid = obj["metadata"]["identifier"].replace("pubmed-", "") assert extid.startswith("PMC") int(extid[3:]) - elif obj['metadata']['identifier'].startswith('jstor-'): - extid_type = 'jstor' - extid = obj['metadata']['identifier'].replace('jstor-', '') + elif obj["metadata"]["identifier"].startswith("jstor-"): + extid_type = "jstor" + extid = obj["metadata"]["identifier"].replace("jstor-", "") int(extid) else: raise NotImplementedError() pdf_file = None - for f in obj['files']: - if f['source'] == "original" and "PDF" in f['format']: + for f in obj["files"]: + if f["source"] == "original" and "PDF" in f["format"]: pdf_file = f break if not pdf_file: - print('skip: no PDF found: {}'.format(obj['metadata']['identifier']), file=sys.stderr) - #for f in obj['files']: + print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr) + # for f in obj['files']: # print(f['format'], file=sys.stderr) return None - assert pdf_file['name'].endswith('.pdf') + assert pdf_file["name"].endswith(".pdf") match = { - 'md5': pdf_file['md5'], - 'sha1': pdf_file['sha1'], - 'size': int(pdf_file['size']), - 'mimetype': 'application/pdf', - 'urls': [ - "https://archive.org/download/{}/{}".format(obj['metadata']['identifier'], - pdf_file['name']), + "md5": pdf_file["md5"], + "sha1": pdf_file["sha1"], + "size": int(pdf_file["size"]), + "mimetype": "application/pdf", + "urls": [ + "https://archive.org/download/{}/{}".format( + obj["metadata"]["identifier"], pdf_file["name"] + ), ], - 'cdx': [], - 'dois': [], + "cdx": [], + "dois": [], } - if extid_type == 'doi': - match['dois'] = [ + if extid_type == "doi": + match["dois"] = [ extid, ] else: @@ -108,5 +109,5 @@ def run() -> None: print(json.dumps(match, sort_keys=True)) -if __name__ == '__main__': +if __name__ == "__main__": run() |