python/ia_pdf_match.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

#!/usr/bin/env python3
"""
Input is IA item metadata JSON.
Output is insertable fatcat "match" JSON

- md5
- sha1
- sha256
- size
- urls
- cdx (list; empty here)

- dois (list)
- pmcid
- jstor
- arxiv

When invoking import matched, be sure to:

    --default-link-rel repository (?)
    --default-mimetype application/pdf
"""

import json
import sys
from typing import Any, Dict, Optional


def parse(obj: dict) -> Optional[Dict[str, Any]]:
    """
    Convert one IA item metadata object into a fatcat "match" dict.

    The external identifier is derived from the item identifier prefix:

    - ``arxiv-``: versioned arXiv id taken from ``metadata['source']``
    - ``paper-doi-10_``: DOI taken from ``metadata['identifier-doi']``
    - ``pubmed-PMC``: PMCID taken from the item identifier
    - ``jstor-``: numeric JSTOR id taken from the item identifier

    Returns the match dict (md5, sha1, size, mimetype, urls, cdx, dois, plus
    one of arxiv/pmcid/jstor for non-DOI items), or None when the item is
    skipped (test item, missing arXiv source, non-versioned arXiv id, or no
    original PDF file). The skip reason is printed to stderr.

    Raises:
        NotImplementedError: the identifier has none of the known prefixes.
        AssertionError: an identifier or the PDF file name fails a sanity
            check (only when assertions are enabled).
        ValueError: the PMCID/JSTOR suffix is not an integer.
        KeyError: a required metadata or PDF file field is missing.
    """
    metadata = obj['metadata']
    identifier = metadata['identifier']

    if identifier.endswith('-test') or metadata.get('test'):
        print('skip: test item', file=sys.stderr)
        return None

    if identifier.startswith('arxiv-'):
        extid_type = 'arxiv'
        extid = metadata.get('source')
        if not extid:
            print('skip: no source', file=sys.stderr)
            return None
        arxiv_prefix = 'http://arxiv.org/abs/'
        assert extid.startswith(arxiv_prefix)
        # Slice instead of str.replace() so only the leading prefix is removed.
        extid = extid[len(arxiv_prefix):]
        assert '/' in extid or '.' in extid
        # Require an explicit trailing "v<digits>" suffix. Merely checking for
        # a 'v' somewhere plus a trailing digit wrongly accepts unversioned
        # old-style ids such as "solv-int/9901001".
        _, version_sep, version = extid.rpartition('v')
        if not version_sep or not version.isdigit():
            print('skip: non-versioned arxiv_id', file=sys.stderr)
            return None
    elif identifier.startswith('paper-doi-10_'):
        extid_type = 'doi'
        extid = metadata['identifier-doi']
        assert extid.startswith("10.")
    elif identifier.startswith('pubmed-PMC'):
        extid_type = 'pmcid'
        extid = identifier[len('pubmed-'):]
        assert extid.startswith("PMC")
        # Validation only: raises ValueError if the part after "PMC" is not an integer.
        int(extid[3:])
    elif identifier.startswith('jstor-'):
        extid_type = 'jstor'
        extid = identifier[len('jstor-'):]
        # Validation only: raises ValueError if the JSTOR id is not an integer.
        int(extid)
    else:
        raise NotImplementedError()

    # Pick the first originally-uploaded file whose format mentions PDF. File
    # entries lacking 'source' or 'format' are passed over rather than
    # aborting the whole item with a KeyError.
    pdf_file = None
    for f in obj['files']:
        if f.get('source') == "original" and "PDF" in (f.get('format') or ''):
            pdf_file = f
            break
    if not pdf_file:
        print('skip: no PDF found: {}'.format(identifier), file=sys.stderr)
        return None

    assert pdf_file['name'].endswith('.pdf')

    match = {
        'md5': pdf_file['md5'],
        'sha1': pdf_file['sha1'],
        'size': int(pdf_file['size']),
        'mimetype': 'application/pdf',
        'urls': [
            "https://archive.org/download/{}/{}".format(identifier, pdf_file['name']),
        ],
        'cdx': [],
        'dois': [],
    }  # type: Dict[str, Any]

    # DOIs go in the 'dois' list; every other identifier type gets its own key.
    if extid_type == 'doi':
        match['dois'] = [extid]
    else:
        match[extid_type] = extid

    return match


def run() -> None:
    """
    Read IA item metadata JSON from stdin, one object per line, and print
    the resulting fatcat "match" JSON (keys sorted) to stdout, one per line.

    Blank or whitespace-only lines are ignored. Items for which parse()
    returns None produce no output.
    """
    for line in sys.stdin:
        # Lines read from stdin keep their trailing newline, so strip before
        # testing for emptiness; otherwise a blank line reaches json.loads()
        # and raises.
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        match = parse(obj)
        if match is not None:
            print(json.dumps(match, sort_keys=True))


# Script entry point: process stdin only when executed directly, not on import.
if __name__ == "__main__":
    run()