diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-06-20 15:38:11 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-06-20 15:38:11 -0700 | 
| commit | e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2 (patch) | |
| tree | 26ee0883820850976c321aa11d825fe5bc5622b8 /python/ia_pdf_match.py | |
| parent | 87603ba93b343c0c2b55d52c5f99697c06a672b4 (diff) | |
| download | sandcrawler-e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2.tar.gz sandcrawler-e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2.zip | |
petabox journal files ingest updates
Diffstat (limited to 'python/ia_pdf_match.py')
| -rwxr-xr-x | python/ia_pdf_match.py | 108 | 
1 files changed, 108 insertions, 0 deletions
#!/usr/bin/env python3

"""
Input is IA item metadata JSON (one object per line on stdin).
Output is insertable fatcat "match" JSON (one object per line on stdout).

Fields emitted for every match:

- md5
- sha1
- size
- mimetype (always application/pdf)
- urls
- cdx (list; empty here)
- dois (list)

Plus, for items that are not DOI-identified, exactly one of:

- pmcid
- jstor_id
- arxiv_id

Note: sha256 is not emitted by this script.

When invoking import matched, be sure to:

    --default-link-rel repository (?)
    --default-mimetype application/pdf
"""

import json
import re
import sys

ARXIV_ABS_PREFIX = 'http://arxiv.org/abs/'

# A versioned arXiv identifier ends in "v<digits>" (e.g. "1234.5678v2").
# Merely containing a "v" is not enough: old-style ids such as
# "solv-int/9901005" have one in the archive name.
ARXIV_VERSION_RE = re.compile(r'v[0-9]+\Z')


def _extract_extid(metadata):
    """Derive the external identifier from an item's ``metadata`` dict.

    Returns an ``(extid_type, extid)`` tuple, or ``None`` when the item
    should be skipped (a "skip: ..." line has then been written to stderr).

    Raises:
        ValueError: the identifier has a recognized prefix but its value is
            malformed.
        NotImplementedError: the identifier prefix is not recognized.
        KeyError: a 'paper-doi-10_' item has no 'identifier-doi' field.
    """
    identifier = metadata['identifier']

    if identifier.startswith('arxiv-'):
        source = metadata.get('source')
        if not source:
            sys.stderr.write('skip: no source\n')
            return None
        if not source.startswith(ARXIV_ABS_PREFIX):
            raise ValueError(
                'unexpected arxiv source URL: {!r}'.format(source))
        # Slice rather than str.replace() so only the leading prefix goes.
        extid = source[len(ARXIV_ABS_PREFIX):]
        if '/' not in extid and '.' not in extid:
            raise ValueError('malformed arxiv_id: {!r}'.format(extid))
        if not ARXIV_VERSION_RE.search(extid):
            sys.stderr.write('skip: non-versioned arxiv_id\n')
            return None
        return ('arxiv_id', extid)

    if identifier.startswith('paper-doi-10_'):
        extid = metadata['identifier-doi']
        if not extid.startswith('10.'):
            raise ValueError('malformed DOI: {!r}'.format(extid))
        return ('doi', extid)

    if identifier.startswith('pubmed-PMC'):
        extid = identifier[len('pubmed-'):]
        # Validation only: int() raises ValueError on a non-numeric suffix.
        int(extid[3:])
        return ('pmcid', extid)

    if identifier.startswith('jstor-'):
        extid = identifier[len('jstor-'):]
        # Validation only: int() raises ValueError on a non-numeric id.
        int(extid)
        return ('jstor_id', extid)

    raise NotImplementedError()


def _find_original_pdf(files):
    """Return the first file entry that is an original PDF, or ``None``.

    An entry qualifies when its 'source' is "original" and its 'format'
    contains "PDF". Entries missing either key are not selected.
    """
    for entry in files:
        if entry.get('source') == 'original' and 'PDF' in entry.get('format', ''):
            return entry
    return None


def parse(obj):
    """Convert one IA item metadata object into a fatcat "match" dict.

    ``obj`` must have a 'metadata' dict (with at least 'identifier') and a
    'files' list of file-entry dicts.

    Returns the match dict, or ``None`` when the item is skipped: test
    items, arxiv items without a source or without a versioned id, and
    items with no original PDF. Each skip writes a "skip: ..." line to
    stderr.

    Raises:
        ValueError: an identifier value is malformed, or the selected PDF
            entry's name does not end in ".pdf".
        NotImplementedError: the item identifier prefix is not recognized.
        KeyError: an expected field is absent.
    """
    metadata = obj['metadata']
    identifier = metadata['identifier']

    if identifier.endswith('-test') or metadata.get('test'):
        sys.stderr.write('skip: test item\n')
        return None

    extid_pair = _extract_extid(metadata)
    if extid_pair is None:
        return None
    extid_type, extid = extid_pair

    pdf_file = _find_original_pdf(obj['files'])
    if pdf_file is None:
        sys.stderr.write('skip: no PDF found: {}\n'.format(identifier))
        return None

    # Explicit raise (not assert) so the check survives `python -O`.
    if not pdf_file['name'].endswith('.pdf'):
        raise ValueError(
            'PDF file name does not end in .pdf: {!r}'.format(pdf_file['name']))

    match = {
        'md5': pdf_file['md5'],
        'sha1': pdf_file['sha1'],
        'size': int(pdf_file['size']),
        'mimetype': 'application/pdf',
        'urls': [
            "https://archive.org/download/{}/{}".format(
                identifier,
                pdf_file['name']),
        ],
        'cdx': [],
        'dois': [],
    }

    # DOIs go in the 'dois' list; every other id type gets its own key.
    if extid_type == 'doi':
        match['dois'] = [extid]
    else:
        match[extid_type] = extid

    return match


def run():
    """Read JSON lines from stdin and print one match JSON line per hit."""
    for line in sys.stdin:
        # Lines keep their trailing newline, so strip before testing for
        # emptiness; otherwise blank lines reach json.loads() and raise.
        line = line.strip()
        if not line:
            continue
        match = parse(json.loads(line))
        if match:
            print(json.dumps(match))


if __name__ == '__main__':
    run()
