From e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 20 Jun 2019 15:38:11 -0700 Subject: petabox journal files ingest updates --- notes/petabox_ia_metadata.txt | 25 ++++++++++ python/ia_pdf_match.py | 108 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100755 python/ia_pdf_match.py diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt index 3a99805..f46ea61 100644 --- a/notes/petabox_ia_metadata.txt +++ b/notes/petabox_ia_metadata.txt @@ -29,3 +29,28 @@ Size/results: bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv 1748645 journals-ia.20181218.pdf-sha1.tsv +## June 2019 Ingest + + bnewbold@ia601101$ pwd + /schnell/iamine-journals + + zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json + zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json + zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json + zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json + + cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json + cat jstor.json | ./ia_pdf_match.py > jstor.match.json + cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json + cat pmc.json | ./ia_pdf_match.py > pmc.match.json + + bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json + 1076012 arxiv.json + 740970 arxiv.match.json + 451204 jstor.json + 451204 jstor.match.json + 77838 paper-doi.json + 23736 paper-doi.match.json + 209787 pmc.json + 189093 pmc.match.json + diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py new file mode 100755 index 0000000..60b7843 --- /dev/null +++ b/python/ia_pdf_match.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +""" +Input is IA item metadata JSON. 
# Output is insertable fatcat "match" JSON, one object per input line:
#
# - md5
# - sha1
# - sha256 (listed for the match format; not emitted by this script)
# - size
# - urls
# - cdx (list; empty here)
#
# - dois (list)
# - pmcid
# - jstor_id
# - arxiv_id
#
# When invoking import matched, be sure to:
#
#   --default-link-rel repository (?)
#   --default-mimetype application/pdf

import json
import re
import sys

ARXIV_ABS_PREFIX = 'http://arxiv.org/abs/'

# A versioned arXiv identifier ends in "v<digits>" (e.g. "1234.5678v2").
# Anchoring at the end matters: a bare "'v' in extid" test also matches
# old-style category names such as "solv-int/9901001", which carry no version.
ARXIV_VERSION_RE = re.compile(r'v\d+$')


def _check(condition, message):
    """Raise AssertionError(message) unless `condition` is truthy.

    Used instead of a bare `assert` so the validation is not stripped when
    the interpreter runs with -O; the exception type is the same one a
    failing `assert` would raise.
    """
    if not condition:
        raise AssertionError(message)


def _extract_extid(metadata):
    """Derive the external identifier for an item from its metadata dict.

    Returns an `(extid_type, extid)` tuple, where `extid_type` is one of
    'arxiv_id', 'doi', 'pmcid' or 'jstor_id', or None when the item should
    be skipped (a reason is written to stderr).

    Raises:
        AssertionError: the identifier does not have the expected shape.
        KeyError: a 'paper-doi-10_' item has no 'identifier-doi' field.
        ValueError: the numeric part of a PMC or JSTOR id is not an integer.
        NotImplementedError: the item identifier has an unsupported prefix.
    """
    identifier = metadata['identifier']

    if identifier.startswith('arxiv-'):
        source = metadata.get('source')
        if not source:
            sys.stderr.write('skip: no source\n')
            return None
        _check(source.startswith(ARXIV_ABS_PREFIX),
               'unexpected arxiv source: {}'.format(source))
        # Strip only the leading prefix, not every occurrence of it.
        extid = source[len(ARXIV_ABS_PREFIX):]
        _check('/' in extid or '.' in extid,
               'unexpected arxiv id: {}'.format(extid))
        if not ARXIV_VERSION_RE.search(extid):
            sys.stderr.write('skip: non-versioned arxiv_id\n')
            return None
        return ('arxiv_id', extid)

    if identifier.startswith('paper-doi-10_'):
        extid = metadata['identifier-doi']
        _check(extid.startswith('10.'), 'unexpected DOI: {}'.format(extid))
        return ('doi', extid)

    if identifier.startswith('pubmed-PMC'):
        extid = identifier[len('pubmed-'):]
        _check(extid.startswith('PMC'), 'unexpected PMCID: {}'.format(extid))
        int(extid[3:])  # validation only: raises ValueError if not numeric
        return ('pmcid', extid)

    if identifier.startswith('jstor-'):
        extid = identifier[len('jstor-'):]
        int(extid)  # validation only: raises ValueError if not numeric
        return ('jstor_id', extid)

    raise NotImplementedError(
        'unsupported item identifier: {}'.format(identifier))


def _find_original_pdf(files):
    """Return the first file entry that is an original PDF, or None.

    An entry qualifies when its 'source' is "original" and its 'format'
    contains "PDF". Entries missing either key are passed over.
    """
    for entry in files:
        if entry.get('source') == 'original' and 'PDF' in entry.get('format', ''):
            return entry
    return None


def parse(obj):
    """Convert one IA item metadata object into a fatcat "match" dict.

    Args:
        obj: dict with a 'metadata' dict (must contain 'identifier') and a
            'files' list of dicts; the chosen PDF entry must provide
            'name', 'md5', 'sha1' and 'size'.

    Returns:
        A dict with keys 'md5', 'sha1', 'size' (int), 'mimetype', 'urls',
        'cdx' (always empty), 'dois', plus one of 'arxiv_id', 'pmcid' or
        'jstor_id' for non-DOI items. Returns None when the item is skipped
        (test item, arXiv item without a source or without a version, or no
        original PDF); the reason is written to stderr.

    Raises:
        AssertionError, KeyError, ValueError, NotImplementedError: the item
        does not have the expected structure (see `_extract_extid`), or the
        selected PDF file name does not end in '.pdf'.
    """
    metadata = obj['metadata']
    identifier = metadata['identifier']

    if identifier.endswith('-test') or metadata.get('test'):
        sys.stderr.write('skip: test item\n')
        return None

    extid_info = _extract_extid(metadata)
    if extid_info is None:
        return None
    extid_type, extid = extid_info

    pdf_file = _find_original_pdf(obj['files'])
    if not pdf_file:
        sys.stderr.write('skip: no PDF found: {}\n'.format(identifier))
        return None

    _check(pdf_file['name'].endswith('.pdf'),
           'unexpected PDF file name: {}'.format(pdf_file['name']))

    match = {
        'md5': pdf_file['md5'],
        'sha1': pdf_file['sha1'],
        'size': int(pdf_file['size']),
        'mimetype': 'application/pdf',
        'urls': [
            "https://archive.org/download/{}/{}".format(
                identifier,
                pdf_file['name']),
        ],
        'cdx': [],
        'dois': [],
    }

    # DOIs go in the 'dois' list; every other id type gets its own key.
    if extid_type == 'doi':
        match['dois'] = [extid]
    else:
        match[extid_type] = extid

    return match


def run():
    """Read JSON lines from stdin and print one match JSON line per hit.

    Blank (whitespace-only) lines are ignored; items for which `parse`
    returns None produce no output.
    """
    for line in sys.stdin:
        # Lines read from stdin keep their trailing newline, so strip
        # before testing for emptiness.
        line = line.strip()
        if not line:
            continue
        match = parse(json.loads(line))
        if match:
            print(json.dumps(match))


if __name__ == '__main__':
    run()