aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-06-20 15:38:11 -0700
committerBryan Newbold <bnewbold@archive.org>2019-06-20 15:38:11 -0700
commite0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2 (patch)
tree26ee0883820850976c321aa11d825fe5bc5622b8 /python
parent87603ba93b343c0c2b55d52c5f99697c06a672b4 (diff)
downloadsandcrawler-e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2.tar.gz
sandcrawler-e0d9aaeedc9e8b9d791a72fc8e91a4869078d6f2.zip
petabox journal files ingest updates
Diffstat (limited to 'python')
-rwxr-xr-xpython/ia_pdf_match.py108
1 files changed, 108 insertions, 0 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
new file mode 100755
index 0000000..60b7843
--- /dev/null
+++ b/python/ia_pdf_match.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+"""
+Input is IA item metadata JSON.
+Output is insertable fatcat "match" JSON
+
+- md5
+- sha1
+- sha256 (not set here)
+- size
+- urls
+- cdx (list; empty here)
+
+- dois (list)
+- pmcid
+- jstor_id
+- arxiv_id
+
+When invoking import matched, be sure to:
+
+ --default-link-rel repository (?)
+ --default-mimetype application/pdf
+"""
+
+import sys
+import json
+
def parse(obj):
    """
    Convert one IA item metadata object into a fatcat "match" dict.

    The external identifier is chosen from the item identifier prefix:

    - ``arxiv-``: versioned arXiv id taken from the ``source`` metadata URL
    - ``paper-doi-10_``: DOI taken from the ``identifier-doi`` metadata field
    - ``pubmed-PMC``: PMCID taken from the item identifier
    - ``jstor-``: JSTOR id taken from the item identifier

    Args:
        obj: dict with a ``metadata`` dict (must contain ``identifier``) and
            a ``files`` list of per-file dicts.

    Returns:
        A dict with ``md5``, ``sha1``, ``size``, ``mimetype``, ``urls``,
        ``cdx`` and ``dois`` keys, plus one of ``arxiv_id`` / ``pmcid`` /
        ``jstor_id`` for non-DOI items. Returns ``None`` (after writing the
        reason to stderr) when the item is skipped: test items, arXiv items
        with no source or no version suffix, and items with no original PDF.

    Raises:
        NotImplementedError: the identifier prefix is not one of the above.
        AssertionError: the arXiv source URL, the DOI, or the PDF file name
            does not have the expected shape.
        ValueError: the PMCID or JSTOR id is not purely numeric.
        KeyError: a required metadata or file field is missing.
    """
    metadata = obj['metadata']
    identifier = metadata['identifier']

    if identifier.endswith('-test') or metadata.get('test'):
        sys.stderr.write('skip: test item\n')
        return None

    # Shape checks raise AssertionError explicitly (rather than using the
    # assert statement) so they keep working under "python -O" while callers
    # still see the same exception type as before.
    if identifier.startswith('arxiv-'):
        extid_type = 'arxiv_id'
        source = metadata.get('source')
        if not source:
            sys.stderr.write('skip: no source\n')
            return None
        abs_prefix = 'http://arxiv.org/abs/'
        if not source.startswith(abs_prefix):
            raise AssertionError(
                'unexpected arxiv source: {}'.format(source))
        # Strip only the leading prefix, not every occurrence.
        extid = source[len(abs_prefix):]
        if '/' not in extid and '.' not in extid:
            raise AssertionError(
                'unexpected arxiv id: {}'.format(extid))
        # A versioned id ends in "v<digits>". Checking for any "v" plus a
        # trailing digit would also accept unversioned old-style ids such as
        # "solv-int/9901001".
        _, marker, version = extid.rpartition('v')
        if not marker or not version.isdigit():
            sys.stderr.write('skip: non-versioned arxiv_id\n')
            return None
    elif identifier.startswith('paper-doi-10_'):
        extid_type = 'doi'
        extid = metadata['identifier-doi']
        if not extid.startswith('10.'):
            raise AssertionError('unexpected DOI: {}'.format(extid))
    elif identifier.startswith('pubmed-PMC'):
        extid_type = 'pmcid'
        extid = identifier[len('pubmed-'):]
        # isdecimal() rejects signs, whitespace and underscores, all of
        # which int() would silently accept.
        if not extid[3:].isdecimal():
            raise ValueError('non-numeric PMCID: {}'.format(extid))
    elif identifier.startswith('jstor-'):
        extid_type = 'jstor_id'
        extid = identifier[len('jstor-'):]
        if not extid.isdecimal():
            raise ValueError('non-numeric JSTOR id: {}'.format(extid))
    else:
        raise NotImplementedError()

    # Use the first originally-uploaded PDF; entries that lack "source" or
    # "format" are passed over instead of raising KeyError.
    pdf_file = None
    for f in obj.get('files') or []:
        if f.get('source') == 'original' and 'PDF' in (f.get('format') or ''):
            pdf_file = f
            break
    if not pdf_file:
        sys.stderr.write('skip: no PDF found: {}\n'.format(identifier))
        return None

    if not pdf_file['name'].endswith('.pdf'):
        raise AssertionError(
            'unexpected PDF file name: {}'.format(pdf_file['name']))

    match = {
        'md5': pdf_file['md5'],
        'sha1': pdf_file['sha1'],
        'size': int(pdf_file['size']),
        'mimetype': 'application/pdf',
        'urls': [
            "https://archive.org/download/{}/{}".format(
                identifier,
                pdf_file['name']),
        ],
        'cdx': [],
        'dois': [],
    }

    # DOIs go into the "dois" list; every other id type has its own key.
    if extid_type == 'doi':
        match['dois'] = [extid]
    else:
        match[extid_type] = extid

    return match
+
def run():
    """
    Read IA item metadata from stdin, one JSON object per line, and print
    one "match" JSON object per line to stdout for every item that parse()
    does not skip.
    """
    for line in sys.stdin:
        # Lines read from a file object keep their trailing newline, so a
        # blank line is only detectable after stripping whitespace.
        line = line.strip()
        if not line:
            continue
        match = parse(json.loads(line))
        if match:
            print(json.dumps(match))


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    run()