From e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Thu, 20 Jun 2019 15:38:11 -0700
Subject: petabox journal files ingest updates

---
 notes/petabox_ia_metadata.txt |  25 ++++++++++
 python/ia_pdf_match.py        | 108 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100755 python/ia_pdf_match.py

diff --git a/notes/petabox_ia_metadata.txt b/notes/petabox_ia_metadata.txt
index 3a99805..f46ea61 100644
--- a/notes/petabox_ia_metadata.txt
+++ b/notes/petabox_ia_metadata.txt
@@ -29,3 +29,28 @@ Size/results:
     bnewbold@ia601101$ wc -l journals-ia.20181218.pdf-sha1.tsv 
     1748645 journals-ia.20181218.pdf-sha1.tsv
 
+## June 2019 Ingest
+
+    bnewbold@ia601101$ pwd
+    /schnell/iamine-journals
+
+    zcat journals-ia.20181218.json.gz | rg '"identifier": "arxiv-' > arxiv.json
+    zcat journals-ia.20181218.json.gz | rg '"identifier": "jstor-' > jstor.json
+    zcat journals-ia.20181218.json.gz | rg '"identifier": "paper-doi-10_' > paper-doi.json
+    zcat journals-ia.20181218.json.gz | rg '"identifier": "pubmed-PMC' > pmc.json
+
+    cat arxiv.json | ./ia_pdf_match.py > arxiv.match.json
+    cat jstor.json | ./ia_pdf_match.py > jstor.match.json
+    cat paper-doi.json | ./ia_pdf_match.py > paper-doi.match.json
+    cat pmc.json | ./ia_pdf_match.py > pmc.match.json
+
+    bnewbold@ia601101$ wc -l arxiv.*json jstor.*json paper-doi.*json pmc.*json 
+        1076012 arxiv.json
+         740970 arxiv.match.json
+         451204 jstor.json
+         451204 jstor.match.json
+          77838 paper-doi.json
+          23736 paper-doi.match.json
+         209787 pmc.json
+         189093 pmc.match.json
+
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
new file mode 100755
index 0000000..60b7843
--- /dev/null
+++ b/python/ia_pdf_match.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+"""
+Input is IA item metadata JSON.
+Output is insertable fatcat "match" JSON
+
+- md5
+- sha1
+- sha256
+- size
+- urls
+- cdx (list; empty here)
+
+- dois (list)
+- pmcid
+- jstor_id
+- arxiv_id
+
+When invoking import matched, be sure to:
+
+    --default-link-rel repository (?)
+    --default-mimetype application/pdf
+"""
+
+import sys
+import json
+
+def parse(obj):
+    """Convert one IA item metadata dict into a fatcat "match" dict.
+
+    Returns None (logging 'skip: ...' to stderr) for test items, arxiv items
+    without a versioned source URL, and items with no original PDF.
+    """
+    metadata = obj['metadata']
+    ident = metadata['identifier']
+    if ident.endswith('-test') or metadata.get('test'):
+        sys.stderr.write('skip: test item\n')
+        return None
+
+    if ident.startswith('arxiv-'):
+        extid_type = 'arxiv_id'
+        extid = metadata.get('source')
+        if not extid:
+            sys.stderr.write('skip: no source\n')
+            return None
+        assert extid.startswith('http://arxiv.org/abs/')
+        extid = extid.replace('http://arxiv.org/abs/', '')
+        assert '/' in extid or '.' in extid
+        # Require a trailing "v<digits>". A bare "'v' in extid" test would
+        # accept unversioned old-style ids such as "solv-int/9901001".
+        _, sep, version = extid.rpartition('v')
+        if not sep or not version.isdigit():
+            sys.stderr.write('skip: non-versioned arxiv_id\n')
+            return None
+    elif ident.startswith('paper-doi-10_'):
+        extid_type = 'doi'
+        extid = metadata['identifier-doi']
+        assert extid.startswith("10.")
+    elif ident.startswith('pubmed-PMC'):
+        extid_type = 'pmcid'
+        extid = ident.replace('pubmed-', '')
+        assert extid.startswith("PMC")
+        int(extid[3:])  # raises ValueError unless the PMC suffix is numeric
+    elif ident.startswith('jstor-'):
+        extid_type = 'jstor_id'
+        extid = ident.replace('jstor-', '')
+        int(extid)  # raises ValueError unless the JSTOR id is numeric
+    else:
+        raise NotImplementedError()
+
+    # Use the first originally-uploaded file whose format mentions PDF.
+    pdf_file = next((f for f in obj['files'] if f.get('source') == "original"
+                     and "PDF" in f.get('format', '')), None)
+    if not pdf_file:
+        sys.stderr.write('skip: no PDF found: {}\n'.format(ident))
+        return None
+
+    assert pdf_file['name'].endswith('.pdf')
+    match = {
+        'md5': pdf_file['md5'],
+        'sha1': pdf_file['sha1'],
+        'size': int(pdf_file['size']),
+        'mimetype': 'application/pdf',
+        'urls': [
+            "https://archive.org/download/{}/{}".format(ident, pdf_file['name']),
+        ],
+        'cdx': [],
+        'dois': [],
+    }
+
+    if extid_type == 'doi':
+        match['dois'] = [extid]
+    else:
+        match[extid_type] = extid
+
+    return match
+
+def run():
+    """Print a fatcat match JSON line for each usable IA item line on stdin."""
+    for line in sys.stdin:
+        if not line.strip():  # raw lines keep their "\n", so strip first
+            continue
+        match = parse(json.loads(line))
+        if match:
+            print(json.dumps(match))
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+    run()
-- 
cgit v1.2.3