Diffstat (limited to 'python/scripts')
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py     72
-rwxr-xr-x  python/scripts/cdx_collection.py               80
-rwxr-xr-x  python/scripts/covid2ingestrequest.py          83
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py    125
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py         166
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py           184
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py          143
-rwxr-xr-x  python/scripts/enrich_scored_matches.py        45
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py      159
-rwxr-xr-x  python/scripts/filter_groupworks.py           144
-rwxr-xr-x  python/scripts/filter_scored_matches.py       116
-rwxr-xr-x  python/scripts/grobid_affiliations.py          52
-rwxr-xr-x  python/scripts/import_grobid_metadata.py       94
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py       51
-rwxr-xr-x  python/scripts/manifest_converter.py           56
-rwxr-xr-x  python/scripts/oai2ingestrequest.py           137
-rwxr-xr-x  python/scripts/pdf_thumbnail.py                35
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py     111
18 files changed, 1853 insertions(+), 0 deletions(-)
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
new file mode 100755
index 0000000..03a1f29
--- /dev/null
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+"""
+This script is intended to be used for backfill ingest of old crawls. It can
+also be used as a fast path for getting freshly crawled content into fatcat if
+the crawl was a hit and the arabesque JSON was exported conservatively.
+
+Run like:
+
+ ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json
+
+The resulting requests can then be run through an ingest tool, or dumped into a
+Kafka queue.
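+
+Each output line is a JSON ingest request along these lines (identifier and URL
+are illustrative):
+
+    {"base_url": "https://example.com/paper.pdf", "ext_ids": {"pmcid": "PMC1234567"}, "ingest_request_source": "arabesque", "ingest_type": "pdf", "link_source": "pmc", "link_source_id": "PMC1234567"}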
+"""
+
+import sys
+import json
+import argparse
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+ if not row['hit']:
+ continue
+
+ request = {
+ 'base_url': row['final_url'],
+ 'ingest_type': args.ingest_type,
+ 'link_source': args.link_source,
+ 'link_source_id': row['identifier'],
+ 'ingest_request_source': args.ingest_request_source,
+ 'ext_ids': {
+ args.extid_type: row['identifier'],
+ },
+ }
+ if args.release_stage:
+ assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
+ request['release_stage'] = args.release_stage
+
+ print("{}".format(json.dumps(request, sort_keys=True)))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--link-source',
+ required=True,
+ help="link_source to include in request")
+ parser.add_argument('--extid-type',
+ required=True,
+ help="extid to encode identifier as")
+ parser.add_argument('--ingest-type',
+ default="pdf",
+ help="ingest type (pdf, html, xml, etc)")
+ parser.add_argument('--ingest-request-source',
+ default="arabesque",
+ help="to include in request")
+ parser.add_argument('--release-stage',
+ default=None,
+ help="to include in request")
+ parser.add_argument('json_file',
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
new file mode 100755
index 0000000..e867b21
--- /dev/null
+++ b/python/scripts/cdx_collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Fetches and merges all CDX files for a collection.
+
+Calls the metadata API to enumerate all items/files, then fetches and
+concatenates them all into <collection>.cdx.gz in the current directory.
+Requires the 'internetarchive' library.
+
+Call with a collection name:
+
+    ./cdx_collection.py SOME_COLLECTION_NAME
+"""
+
+import os
+import sys
+import shutil
+import tempfile
+import requests
+import subprocess
+import internetarchive as ia
+
+def run():
+
+ if len(sys.argv) != 2:
+ print("Expected a single argument (collection name)")
+ sys.exit(-1)
+
+ collection = sys.argv[1]
+
+ # Check collection name is clean
+ assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+
+ tempdir = tempfile.mkdtemp()
+ print("Looking up collection: {}".format(collection))
+
+ # First fetch list
+ item_list = list(
+ ia.search_items(
+ query="collection:{} mediatype:web".format(collection)))
+
+ if len(item_list) == 0:
+ print("No items found, bailing")
+ sys.exit(-1)
+
+ print("Found {} potential items".format(len(item_list)))
+ status = True
+ errors = []
+ for item in item_list:
+ item = item['identifier']
+ # TODO: error handling
+ try:
+ ret = ia.download(item, files=[item + '.cdx.gz'],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000)
+ status = ret and status
+ except requests.exceptions.ReadTimeout as rt:
+ print(str(rt), file=sys.stderr)
+ errors.append(rt)
+ continue
+
+ if errors:
+ print("## Download Errors", file=sys.stderr)
+ for e in errors:
+ print(e, file=sys.stderr)
+
+ # Combine files
+ print("Merging and re-compressing all CDX files...")
+ #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
+ shell=True)
+
+ # Move and cleanup
+ shutil.move('{}/combined.gz'.format(tempdir),
+ '{}.cdx.gz'.format(collection))
+
+ print("Done!")
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
new file mode 100755
index 0000000..33c425d
--- /dev/null
+++ b/python/scripts/covid2ingestrequest.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+
+"""
+Transform COVID-19 related metadata scrapes (CNKI, Wanfang) into ingest requests.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+
+def transform_cnki(obj):
+
+    assert obj['cnki_id']
+
+    requests = []
+    requests.append({
+ 'base_url': canon(obj['info_url']),
+ 'ingest_type': 'pdf',
+ 'link_source': 'cnki_covid19',
+ 'link_source_id': obj['cnki_id'],
+ 'ingest_request_source': 'scrape-covid19',
+ })
+ if 'read_url' in obj:
+ requests.append({
+ 'base_url': canon(obj['read_url']),
+ 'ingest_type': 'pdf', # actually HTML
+ 'link_source': 'cnki_covid19',
+ 'link_source_id': obj['cnki_id'],
+ 'ingest_request_source': 'scrape-covid19',
+ })
+
+ return requests
+
+def transform_wanfang(obj):
+
+ assert obj['wanfang_id']
+ return [{
+ 'base_url': canon(obj['url']),
+ 'ingest_type': 'pdf',
+ 'link_source': 'wanfang_covid19',
+ 'link_source_id': obj['wanfang_id'],
+ 'ingest_request_source': 'scrape-covid19',
+ }]
+
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ if 'wanfang_id' in row:
+ requests = transform_wanfang(row) or []
+ elif 'cnki_id' in row:
+ requests = transform_cnki(row) or []
+ else:
+ continue
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('json_file',
+ help="COVID-19 metadata file to use",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
new file mode 100755
index 0000000..86b3b35
--- /dev/null
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump
+(from HBase) to AWS S3.
+
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1_hex, json (including grobid0:tei_xml)`
+ => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered
+ down (eg, by join by SHA-1) to a specific manifest
+- AWS S3 bucket and prefix
+
+AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+Output:
+- errors/stats to stderr
+- log to stdout (redirect to file), prefixed by sha1
+
+Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
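+
+Example invocation (bucket name and input path are illustrative; AWS
+credentials come from the environment):
+
+    ./deliver_dumpgrobid_to_s3.py --s3-bucket example-grobid-bucket dumpgrobid.tsv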
+"""
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import boto3
+import raven
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+def b32_hex(s):
+ """copy/pasta from elsewhere"""
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ return s
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+
+class DeliverDumpGrobidS3():
+
+ def __init__(self, s3_bucket, **kwargs):
+ self.rstore = None
+ self.count = Counter()
+ self.s3_bucket = s3_bucket
+ self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
+ self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
+ self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
+ self.s3 = boto3.resource('s3')
+ self.bucket = self.s3.Bucket(self.s3_bucket)
+
+ def run(self, dump_file):
+ sys.stderr.write("Starting...\n")
+ for line in dump_file:
+ line = line.strip().split('\t')
+ if len(line) != 2:
+ self.count['skip-line'] += 1
+ continue
+ sha1_hex, grobid_json = line[0], line[1]
+ if len(sha1_hex) != 40:
+ sha1_hex = b32_hex(sha1_hex)
+ assert len(sha1_hex) == 40
+ grobid = json.loads(grobid_json)
+ tei_xml = grobid.get('tei_xml')
+ if not tei_xml:
+ print("{}\tskip empty".format(sha1_hex))
+ self.count['skip-empty'] += 1
+ continue
+ tei_xml = tei_xml.encode('utf-8')
+ # upload to AWS S3
+ obj = self.bucket.put_object(
+ Key="{}{}/{}{}".format(
+ self.s3_prefix,
+ sha1_hex[0:4],
+ sha1_hex,
+ self.s3_suffix),
+ Body=tei_xml,
+ StorageClass=self.s3_storage_class,
+ )
+ print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
+ self.count['success-s3'] += 1
+ sys.stderr.write("{}\n".format(self.count))
+
+@sentry_client.capture_exceptions
+def main():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--s3-bucket',
+ required=True,
+ type=str,
+ help='AWS S3 bucket to upload into')
+ parser.add_argument('--s3-prefix',
+ type=str,
+ default="grobid/",
+ help='key prefix for items created in bucket')
+ parser.add_argument('--s3-suffix',
+ type=str,
+ default=".tei.xml",
+ help='file suffix for created objects')
+ parser.add_argument('--s3-storage-class',
+ type=str,
+ default="STANDARD",
+ help='AWS S3 storage class (redundancy) to use')
+ parser.add_argument('dump_file',
+ help="TSV/JSON dump file",
+ default=sys.stdin,
+ type=argparse.FileType('r'))
+ args = parser.parse_args()
+
+ worker = DeliverDumpGrobidS3(**args.__dict__)
+ worker.run(args.dump_file)
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
new file mode 100755
index 0000000..3dcf962
--- /dev/null
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk copying of PDFs (or other files) from GWB to local disk.
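+
+Example invocation (paths are illustrative; PETABOX_WEBDATA_SECRET and
+SENTRY_DSN are read from the environment):
+
+    ./deliver_gwb_to_disk.py --disk-dir /srv/sandcrawler/pdfs manifest.tsv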
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import raven
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+class DeliverGwbDisk:
+
+ def __init__(self, disk_dir, **kwargs):
+ self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.rstore = None
+ self.count = Counter()
+ # /serve/ instead of /download/ doesn't record view count
+ self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.disk_dir = disk_dir
+ self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
+ self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+
+ def fetch_warc_content(self, warc_path, offset, c_size):
+ warc_uri = self.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
+ try:
+ gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+ except wayback.exception.ResourceUnavailable:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
+
+ if gwb_record.get_status()[0] != 200:
+ return None, dict(status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
+
+ try:
+ raw_content = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ return None, dict(status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return raw_content, None
+
+ def run(self, manifest_file):
+ sys.stderr.write("Ensuring all 65536 base directories exist...\n")
+ for i in range(256):
+ for j in range(256):
+ fpath = "{}/{}{:02x}/{:02x}".format(
+ self.disk_dir,
+ self.disk_prefix,
+ i,
+ j)
+ os.makedirs(fpath, exist_ok=True)
+ sys.stderr.write("Starting...\n")
+ for line in manifest_file:
+ self.count['total'] += 1
+ line = line.strip().split('\t')
+ if len(line) != 2:
+ self.count['skip-line'] += 1
+ continue
+ sha1_hex, cdx_json = line[0], line[1]
+ assert len(sha1_hex) == 40
+ file_cdx = json.loads(cdx_json)
+ # If warc is not item/file.(w)arc.gz form, skip it
+ if len(file_cdx['warc'].split('/')) != 2:
+                sys.stderr.write('WARC path not petabox item/file: {}\n'.format(file_cdx['warc']))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
+ self.count['skip-warc'] += 1
+ continue
+ # fetch from GWB/petabox via HTTP range-request
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ if blob is None and status:
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ self.count['err-petabox-fetch'] += 1
+ continue
+ elif not blob:
+ print("{}\tskip-empty-blob".format(sha1_hex))
+ self.count['skip-empty-blob'] += 1
+ continue
+ # verify sha1
+ if sha1_hex != hashlib.sha1(blob).hexdigest():
+ #assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ print("{}\terror petabox-hash-mismatch".format(sha1_hex))
+ self.count['err-petabox-hash-mismatch'] += 1
+
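+            # note: on a hash mismatch the error is logged and counted, but the
+            # blob is still written to disk below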
+ self.count['petabox-ok'] += 1
+ # save to disk
+ fpath = "{}/{}{}/{}/{}{}".format(
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix)
+ with open(fpath, 'wb') as f:
+ f.write(blob)
+ print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
+ self.count['success-disk'] += 1
+ sys.stderr.write("{}\n".format(self.count))
+
+@sentry_client.capture_exceptions
+def main():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--disk-dir',
+ required=True,
+ type=str,
+ help='local base directory to save into')
+ parser.add_argument('--disk-prefix',
+ type=str,
+ default="pdf/",
+ help='directory prefix for items created in bucket')
+ parser.add_argument('--disk-suffix',
+ type=str,
+ default=".pdf",
+ help='file suffix for created files')
+ parser.add_argument('--warc-uri-prefix',
+ type=str,
+ default='https://archive.org/serve/',
+ help='URI where WARCs can be found')
+ parser.add_argument('manifest_file',
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType('r'))
+ args = parser.parse_args()
+
+ worker = DeliverGwbDisk(**args.__dict__)
+ worker.run(args.manifest_file)
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
new file mode 100755
index 0000000..39ac000
--- /dev/null
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""
+Tool for bulk copying of PDFs (or other files) from GWB to AWS S3.
+
+See unpaywall delivery README (in bnewbold's scratch repo) for notes on running
+this script for that specific use-case.
+
+Script takes:
+- input TSV: `sha1_hex, file:cdx (json)`
+ => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest
+- AWS S3 bucket and prefix
+
+AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/.
+
+20x threads on a single machine can process about 340k files in 3 hours; that's
+roughly 6 hours per million per host with 32 threads, or 5k files an hour
+(1.6/second) per thread. Two large machines should be able to upload 10 million
+files in about 30 hours.
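+
+(Sanity check on that arithmetic: 340k files / 3 hours ≈ 113k files/hour over
+20 threads ≈ 5.7k files/hour ≈ 1.6 files/second per thread; 32 threads is then
+~180k files/hour, or ~5.5 hours per million files per host.)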
+
+Output:
+- errors/stats to stderr
+- log to stdout (redirect to file), prefixed by sha1
+
+Requires:
+- raven (sentry)
+- boto3 (AWS S3 client library)
+- wayback/GWB libraries
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import sys
+import json
+import base64
+import hashlib
+import argparse
+from collections import Counter
+
+import boto3
+import raven
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+
+class DeliverGwbS3:
+
+ def __init__(self, s3_bucket, **kwargs):
+ self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.rstore = None
+ self.count = Counter()
+ # /serve/ instead of /download/ doesn't record view count
+ self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ # gwb library will fall back to reading from /opt/.petabox/webdata.secret
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.s3_bucket = s3_bucket
+ self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
+ self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
+ self.s3 = boto3.resource('s3')
+ self.bucket = self.s3.Bucket(self.s3_bucket)
+
+ def fetch_warc_content(self, warc_path, offset, c_size):
+ warc_uri = self.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
+ try:
+ gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+ except wayback.exception.ResourceUnavailable:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
+
+ if gwb_record.get_status()[0] != 200:
+ return None, dict(status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
+
+ try:
+ raw_content = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ return None, dict(status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return raw_content, None
+
+ def run(self, manifest_file):
+ sys.stderr.write("Starting...\n")
+ for line in manifest_file:
+ self.count['total'] += 1
+ line = line.strip().split('\t')
+ if len(line) != 2:
+ self.count['skip-line'] += 1
+ continue
+ sha1_hex, cdx_json = line[0], line[1]
+ assert len(sha1_hex) == 40
+ file_cdx = json.loads(cdx_json)
+ # If warc is not item/file.(w)arc.gz form, skip it
+ if len(file_cdx['warc'].split('/')) != 2:
+                sys.stderr.write('WARC path not petabox item/file: {}\n'.format(file_cdx['warc']))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
+ self.count['skip-warc'] += 1
+ continue
+ # fetch from GWB/petabox via HTTP range-request
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ if blob is None and status:
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ self.count['err-petabox-fetch'] += 1
+ continue
+ elif not blob:
+ print("{}\tskip-empty-blob".format(sha1_hex))
+ self.count['skip-empty-blob'] += 1
+ continue
+ # verify sha1
+ if sha1_hex != hashlib.sha1(blob).hexdigest():
+ #assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ print("{}\terror petabox-hash-mismatch".format(sha1_hex))
+ self.count['err-petabox-hash-mismatch'] += 1
+
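+            # note: on a hash mismatch the error is logged and counted, but the
+            # blob is still uploaded to S3 below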
+ self.count['petabox-ok'] += 1
+ # upload to AWS S3
+ obj = self.bucket.put_object(
+ Key="{}{}/{}{}".format(
+ self.s3_prefix,
+ sha1_hex[0:4],
+ sha1_hex,
+ self.s3_suffix),
+ Body=blob)
+ print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
+ self.count['success-s3'] += 1
+ sys.stderr.write("{}\n".format(self.count))
+
+@sentry_client.capture_exceptions
+def main():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--s3-bucket',
+ required=True,
+ type=str,
+ help='AWS S3 bucket to upload into')
+ parser.add_argument('--s3-prefix',
+ type=str,
+ default="pdf/",
+ help='key prefix for items created in bucket')
+ parser.add_argument('--s3-suffix',
+ type=str,
+ default=".pdf",
+ help='file suffix for created objects')
+ parser.add_argument('--warc-uri-prefix',
+ type=str,
+ default='https://archive.org/serve/',
+ help='URI where WARCs can be found')
+ parser.add_argument('manifest_file',
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType('r'))
+ args = parser.parse_args()
+
+ worker = DeliverGwbS3(**args.__dict__)
+ worker.run(args.manifest_file)
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
new file mode 100755
index 0000000..a7214d0
--- /dev/null
+++ b/python/scripts/doaj2ingestrequest.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+"""
+Transform a DOAJ article dump (JSON) into ingest requests.
+
+TODO: should we also attempt PDF ingest for HTML links? They seem to often be
+landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
+in the HTML headers and adds an ingest request on that basis. Or even just run
+the re-ingest in-process and publish a second result.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+from typing import Optional, List
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ #"semanticscholar.org/",
+ "://doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+
+ # large publishers/platforms; may remove in the future
+ #"://link.springer.com/",
+ #"://dergipark.gov.tr/",
+ #"frontiersin.org/",
+ #"scielo",
+]
+
+# these default to PDF; note that we also do pdf ingests for HTML pages
+CONTENT_TYPE_MAP = {
+ "abstract": [],
+ "doc": [],
+ "": ["pdf"],
+
+ "doi": ["pdf"],
+ "url": ["pdf"],
+ "fulltext": ["pdf"],
+ "anySimpleType": ["pdf"],
+
+ "application/pdf": ["pdf"],
+ "html": ["html", "pdf"],
+ "text/html": ["html", "pdf"],
+ "xml": ["xml"],
+}
+
+def canon(s: str) -> str:
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+def transform(obj: dict) -> List[dict]:
+ """
+ Transforms from a single DOAJ object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ doaj_id = obj['id'].lower()
+ assert doaj_id
+
+ bibjson = obj['bibjson']
+ if not bibjson['link']:
+ return []
+
+ requests = []
+
+ doi: Optional[str] = None
+ for ident in (bibjson['identifier'] or []):
+ if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
+ doi = ident['id'].lower()
+
+ for link in (bibjson['link'] or []):
+ if link.get('type') != "fulltext" or not link.get('url'):
+ continue
+ ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+ if not ingest_types:
+ continue
+
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in link['url'].lower():
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(link['url'].strip())
+ except UnicodeEncodeError:
+ continue
+
+ if not base_url or len(base_url) > 1000:
+ continue
+
+ for ingest_type in ingest_types:
+ request = {
+ 'base_url': base_url,
+ 'ingest_type': ingest_type,
+ 'link_source': 'doaj',
+ 'link_source_id': doaj_id,
+ 'ingest_request_source': 'doaj',
+ 'release_stage': 'published',
+ 'rel': 'publisher',
+ 'ext_ids': {
+ 'doi': doi,
+ 'doaj': doaj_id,
+ },
+ 'edit_extra': {},
+ }
+ requests.append(request)
+
+ return requests
+
+def run(args) -> None:
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main() -> None:
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('json_file',
+ help="DOAJ article dump file to use",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
new file mode 100755
index 0000000..9fe1499
--- /dev/null
+++ b/python/scripts/enrich_scored_matches.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Takes an "joined" TSV input stream:
+
+- sha1
+- dois (JSON list)
+- cdx (JSON object)
+ - url
+ - dt
+ (etc)
+- mimetype
+- size (integer)
+
+And outputs JSON objects that can be imported into fatcat with the
+"matched" script.
+
+No dependencies (only python3 stdlib)
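+
+An output line looks like (hash, DOI, and URL are illustrative):
+
+    {"sha1": "<40-char hex>", "dois": ["10.1234/example"], "cdx": [{"url": "https://example.com/paper.pdf", "dt": "20170101010101"}], "size": 123456, "mimetype": "application/pdf"}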
+"""
+
+import sys
+import json
+import base64
+
+def run():
+ for line in sys.stdin:
+ line = line.split('\t')
+ assert len(line) == 5
+ raw_sha1 = line[0].replace('sha1:', '')
+ dois = json.loads(line[1])
+ cdx = json.loads(line[2])
+ mimetype = line[3]
+ size = int(line[4])
+
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+
+ obj = dict(
+ sha1=sha1,
+ dois=dois,
+ cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ size=size,
+ mimetype=mimetype)
+ print(json.dumps(obj))
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
new file mode 100755
index 0000000..dc4bea7
--- /dev/null
+++ b/python/scripts/filter_grobid_metadata.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+
+with open('title_slug_denylist.txt', 'r') as f:
+ TITLE_DENYLIST = [l.strip() for l in f]
+
+TITLE_DENYLIST.extend((
+ 'editorial',
+ 'advertisement',
+ 'bookreviews',
+ 'reviews',
+ 'nr',
+ 'abstractoriginalarticle',
+ 'originalarticle',
+ 'impactfactor',
+ 'articlenumber',
+))
+
+# The full name can't *entirely* be one of these
+NAME_DENYLIST = (
+ 'phd',
+ 'phdstudent',
+)
+
+def tokenize(s, remove_whitespace=True):
+
+    s = s.replace('&apos;', "'")
+    # Keep only letters and whitespace (drops digits and punctuation)
+    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+
+ if remove_whitespace:
+ s = ''.join(s.split())
+
+ # Encode as dumb ASCII (TODO: this is horrible)
+ return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+
+assert tokenize("Impact Factor: 2.114") == "impactfactor"
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
+def filter_title(title):
+
+ title = title.strip()
+ if len(title) > 500:
+ return None
+ title_slug = tokenize(title, remove_whitespace=True)
+ if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
+ return None
+ if title_slug.startswith('nr'):
+ return None
+ if title.lower().replace('.', '').startswith('int j '):
+ return None
+
+ for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
+ if title.startswith(prefix):
+            title = title.replace(prefix, '')
+
+ if title.startswith("The Journal of "):
+ return None
+
+ if "volume" in title_slug and "issue" in title_slug:
+ return None
+
+ if "downloadedfrom" in title_slug:
+ return None
+
+ if title_slug.startswith("issn"):
+ return None
+
+ # titles with too many or too few words in title
+ title_words = len(title.split())
+ if title_words > 50 or title_words < 2:
+ return None
+
+ # titles with spaces between every letter (more than N such single-char words)
+ if len([True for w in title.split() if len(w) == 1]) > 12:
+ return None
+
+ # too deep subtitling/splitting
+ if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ return None
+
+ return title
+
+def filter_author_name(name):
+ name = name['name']
+ if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
+ return None
+ return ' '.join([t for t in name.split() if tokenize(t)])
+
+def filter_authors(l):
+ return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
+def filter_refs(l):
+ # TODO:
+ return l
+
+def filter_journal_name(name):
+ # same denylist, for now
+ if not name:
+ return None
+ name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ slug_name = tokenize(name)
+ if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
+ return None
+ for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ if name.startswith(prefix):
+ name = name.replace(prefix, '')
+ for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ if name.endswith(suffix):
+ name = name.replace(suffix, '')
+ if "====================" in name:
+ return None
+ if len(name) > 150:
+ return None
+ return ' '.join(name.split())
+
+def filter_metadata(obj):
+ if not (obj.get('title') and obj.get('authors')):
+ return None
+
+ title = filter_title(obj['title'])
+ if not title:
+ #sys.stderr.write("bad title\n")
+ return None
+ else:
+ obj['title'] = title
+ obj['authors'] = filter_authors(obj['authors'])
+ obj['citations'] = filter_refs(obj['citations'])
+ obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+
+ return obj
+
+def run(invert=False):
+ for line in sys.stdin:
+ fields = line.split('\t')
+ if len(fields) == 5:
+ raw = fields[4]
+ elif len(fields) == 1:
+ raw = fields[0]
+ else:
+ sys.stderr.write("bad line\n")
+ continue
+ obj = json.loads(raw)
+ processed = filter_metadata(obj)
+ if processed:
+ if not invert:
+ processed = json.dumps(processed)
+ if len(fields) == 5:
+ fields[4] = processed
+ else:
+ fields[0] = processed
+ print('\t'.join(fields))
+ elif invert:
+ print(raw.strip())
+
+if __name__=="__main__":
+ run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
new file mode 100755
index 0000000..bbba770
--- /dev/null
+++ b/python/scripts/filter_groupworks.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Filters the sorted output stream of the "groupworks" scalding job, and outputs
+"good enough" matches to be merged in fatcat.
+
+Output is JSON lines which are arrays of releases that could/should be merged
+together, either as multiple releases under a single work, or releases merged
+into a single entity (via redirects).
+
+Note that releases *should* only end up on a single line, and only once per
+line!
+
+No dependencies (only python3 stdlib)
+
+Note: the actual importer/merger should filter the following patterns out:
+- container title has "letter" and "diar"
+- contribs (authors) contain "&NA;"
+- dates differ (not just year)
+"""
+
+import sys
+import json
+
+# out of 1000
+SCORE_THRESHOLD = 900
+
+MAX_SLUG_LINES = 50
+
+REQUIRE_AUTHORS = False
+
+def tokenize(s, remove_whitespace=False):
+
+    s = s.replace('&apos;', "'")
+ # Remove non-alphanumeric characters
+ s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+
+ if remove_whitespace:
+ s = ''.join(s.split())
+
+ # Encode as dumb ASCII (TODO: this is horrible)
+ return s.encode('ascii', 'replace').replace(b'?', b'')
+
+def check_authors(left, right):
+ """
+ Intended to check GROBID extracted authors (right) against "known good"
+ (but maybe not perfect) Crossref metadata authors ("left").
+ """
+ if not left and not right:
+ return bool(not REQUIRE_AUTHORS)
+ if len(left) != len(right):
+ return False
+ right_all = tokenize(" ".join(right))
+ for i in range(len(left)):
+ l = left[i].lower().replace('jr.', '').split()
+ if not l:
+ return False
+ l = tokenize(l[-1])
+ if len(l) <= 1:
+ # weird author name (single char)
+ return False
+ if l not in right_all:
+ #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ return False
+ return True
+
+def test_check_authors():
+ assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
+ assert not check_authors([], ['one'])
+ assert check_authors(['one'], ['one'])
+ assert check_authors(['one two'], ['One Two'])
+ assert check_authors(['two'], ['One Two'])
+ assert check_authors(['two'], ['two, one'])
+ assert check_authors(['mago'], ['Mr. Magoo'])
+ assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
+ assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
+# Rows are (score, left, right)
+def process_group(rows):
+
+ # first pass reduces size of list and generates a linkage graph
+ filtered = list()
+ for row in rows:
+ score = int(row[0])
+ if score < SCORE_THRESHOLD:
+ continue
+ left = json.loads(row[1])
+ right = json.loads(row[2])
+ # authors must roughly match
+ if not check_authors(left['authors'], right['authors']):
+ continue
+ # years must match (if defined)
+ if left['year'] and right['year'] and left['year'] != right['year']:
+ continue
+ filtered.append((left, right))
+
+ if not filtered:
+ return
+
+ # second pass finds a connected graph and returns that
+ releases = dict()
+ group_ids = set()
+ for row in filtered[1:]:
+ (left, right) = row
+ l_id = left['fatcat_release']
+ r_id = right['fatcat_release']
+ releases[l_id] = left
+ releases[r_id] = right
+ if not group_ids:
+ group_ids.add(l_id)
+ group_ids.add(r_id)
+ continue
+ if l_id in group_ids or r_id in group_ids:
+ group_ids.add(l_id)
+ group_ids.add(r_id)
+ continue
+
+ if not group_ids:
+ return
+
+ print(json.dumps([releases[ident] for ident in group_ids]))
+
+def run():
+
+ last_slug = None
+ lines = []
+
+ # group lines by slug, and process in batches
+ for line in sys.stdin:
+ line = line.strip().split('\t')
+ assert len(line) == 4
+ slug = line[0]
+ if last_slug and slug != last_slug and lines:
+ if len(lines) <= MAX_SLUG_LINES:
+ process_group(lines)
+ lines = []
+ last_slug = slug
+ lines.append(line[1:])
+
+ # catch any remaining
+ if lines:
+ process_group(lines)
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
new file mode 100755
index 0000000..3654b87
--- /dev/null
+++ b/python/scripts/filter_scored_matches.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Filters the sorted output stream of the "matchcrossref" scalding job, and outputs
+"good enough" matches to be inserted to fatcat.
+
+Currently works on DOI numbers. Filters for a high enough string match (doesn't
+re-do title match), and checks author lists. Filters out slugs with too many
+matches, and outputs one-line-per-sha1 (aka, file).
+
+No dependencies (only python3 stdlib)
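+
+Each output line is tab-separated: a 40-character SHA-1 hex string, then a JSON
+list of DOIs, e.g. (values illustrative):
+
+    <sha1_hex>    ["10.1234/example"]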
+"""
+
+import sys
+import json
+
+# out of 1000
+score_threshold = 900
+
+max_slug_lines = 10
+
+require_authors = 1
+
+
+def tokenize(s, remove_whitespace=False):
+
+    s = s.replace('&apos;', "'")
+ # Remove non-alphanumeric characters
+ s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+
+ if remove_whitespace:
+ s = ''.join(s.split())
+
+ # Encode as dumb ASCII (TODO: this is horrible)
+ return s.encode('ascii', 'replace').replace(b'?', b'')
+
+def check_authors(left, right):
+ """
+ Intended to check GROBID extracted authors (right) against "known good"
+ (but maybe not perfect) Crossref metadata authors ("left").
+ """
+ if not left:
+ return False
+ if len(left) > len(right):
+ return False
+ right_all = tokenize(" ".join(right))
+ for i in range(len(left)):
+ l = left[i].lower().replace('jr.', '').split()
+ if not l:
+ return False
+ l = tokenize(l[-1])
+ if len(l) <= 1:
+ # weird author name (single char)
+ return False
+ if l not in right_all:
+ #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ return False
+ return True
+
+def test_check_authors():
+ assert not check_authors([], [])
+ assert not check_authors([], ['one'])
+ assert check_authors(['one'], ['one'])
+ assert check_authors(['one two'], ['One Two'])
+ assert check_authors(['two'], ['One Two'])
+ assert check_authors(['two'], ['two, one'])
+ assert check_authors(['mago'], ['Mr. Magoo'])
+ assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
+ assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
+# Rows are (score, grobid, crossref)
+def process_group(rows):
+ if len(rows) > max_slug_lines:
+ return
+ keepers = dict()
+ for row in rows:
+ score = int(row[0])
+ if score < score_threshold:
+ continue
+ grobid = json.loads(row[1])
+ crossref = json.loads(row[2])
+ if not check_authors(crossref['authors'], grobid['authors']):
+ #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ continue
+ else:
+ #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ pass
+ sha1 = grobid['sha1']
+ doi = crossref['doi'].lower()
+ l = keepers.get(sha1, list())
+ l.append(doi)
+ keepers[sha1] = l
+ for sha1, doi_list in keepers.items():
+ print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
+def run():
+
+ last_slug = None
+ lines = []
+
+ # group lines by slug, and process in batches
+ for line in sys.stdin:
+ line = line.strip().split('\t')
+ assert len(line) == 4
+ slug = line[0]
+ if last_slug and slug != last_slug and lines:
+ process_group(lines)
+ lines = []
+ last_slug = slug
+ lines.append(line[1:])
+
+ # catch any remaining
+ if lines:
+ process_group(lines)
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
new file mode 100755
index 0000000..79feac1
--- /dev/null
+++ b/python/scripts/grobid_affiliations.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+"""
+Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
+output, converts the XML to JSON, extracts the raw affiliation strings, and
+dumps them as a JSON subset.
+
+Run in bulk like:
+
+ ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
+"""
+
+import sys
+import json
+
+from grobid2json import teixml2json
+
+def parse_hbase(line):
+ line = line.split('\t')
+ assert len(line) == 2
+ sha1hex = line[0]
+ obj = json.loads(line[1])
+ tei_xml = obj['tei_xml']
+ return sha1hex, tei_xml
+
+def parse_pg(line):
+ obj = json.loads(line)
+ return obj['sha1hex'], obj['tei_xml']
+
+def run(mode='hbase'):
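+    # NOTE: 'pg' mode is not selectable from the command line; run() is always
+    # called with the default 'hbase' mode below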
+ for line in sys.stdin:
+ if mode == 'hbase':
+ sha1hex, tei_xml = parse_hbase(line)
+ elif mode == 'pg':
+ sha1hex, tei_xml = parse_pg(line)
+ else:
+ raise NotImplementedError('parse mode: {}'.format(mode))
+
+ obj = teixml2json(tei_xml, encumbered=False)
+
+ affiliations = []
+ for author in obj['authors']:
+ if author.get('affiliation'):
+ affiliations.append(author['affiliation'])
+ if affiliations:
+ # don't duplicate affiliations; only the unique ones
+ affiliations = list(set([json.dumps(a) for a in affiliations]))
+ affiliations = [json.loads(a) for a in affiliations]
+ print('\t'.join([sha1hex, json.dumps(affiliations)]))
+
+if __name__=='__main__':
+ run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
new file mode 100755
index 0000000..d01b526
--- /dev/null
+++ b/python/scripts/import_grobid_metadata.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
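+# Reads GROBID-derived metadata JSON (one object per line) from stdin and
+# transforms each record into a fatcat release-style dict, printed to stdout.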
+import sys
+import json
+import datetime
+
+MAX_ABSTRACT_BYTES=4096
+
+def parse_grobid_json(obj):
+
+ if not obj.get('title'):
+ return None
+
+ extra = dict()
+
+ if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
+ abobj = dict(
+ mimetype="text/plain",
+ language=None,
+ content=obj.get('abstract').strip())
+ abstracts = [abobj]
+ else:
+ abstracts = None
+
+ contribs = []
+ for a in obj.get('authors', []):
+ c = dict(raw_name=a, role="author")
+ contribs.append(c)
+
+ refs = []
+ for raw in obj.get('citations', []):
+        # use a separate dict so the per-reference extra does not clobber the
+        # release-level `extra` defined above
+        ref_extra = dict()
+        ref = dict()
+        ref['key'] = raw.get('id')
+        if raw.get('title'):
+            ref['title'] = raw['title'].strip()
+        if raw.get('date'):
+            try:
+                year = int(raw['date'].strip()[:4])
+                ref['year'] = year
+            except ValueError:
+                pass
+        for key in ('volume', 'url', 'issue', 'publisher'):
+            if raw.get(key):
+                ref_extra[key] = raw[key].strip()
+        if raw.get('authors'):
+            ref_extra['authors'] = [a['name'] for a in raw['authors']]
+        if ref_extra:
+            ref['extra'] = dict(grobid=ref_extra)
+        else:
+            ref['extra'] = None
+ refs.append(ref)
+
+ release_type = "journal-article"
+ release_date = None
+ if obj.get('date'):
+ # TODO: only returns year, ever? how to handle?
+ release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+
+ if obj.get('doi'):
+ extra['doi'] = obj['doi'].lower()
+ if obj['journal'].get('name'):
+ extra['container_name'] = obj['journal']['name']
+
+ extra['is_longtail_oa'] = True
+
+ # TODO: ISSN/eISSN handling? or just journal name lookup?
+
+ if extra:
+ extra = dict(grobid=extra)
+ else:
+ extra = None
+
+ return dict(
+ title=obj['title'].strip(),
+ contribs=contribs,
+ publisher=obj['journal'].get('publisher'),
+ volume=obj['journal'].get('volume'),
+ issue=obj['journal'].get('issue'),
+ abstracts=abstracts,
+ release_type=release_type,
+ release_date=release_date,
+ extra=extra)
+
+def run():
+ for line in sys.stdin:
+ obj = json.loads(line)
+ out = parse_grobid_json(obj)
+ if out:
+ print(out)
+
+if __name__=="__main__":
+ run()
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
new file mode 100755
index 0000000..494ec7a
--- /dev/null
+++ b/python/scripts/ingestrequest_row2json.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+"""
+This script is used to turn ingest request postgres rows (in JSON export
+format) back into regular ingest request JSON.
+
+The only difference is the name and location of some optional keys.
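+
+For example (values illustrative), a row like:
+
+    {"base_url": "...", "created": "...", "request": {"ext_ids": {"doi": "10.1234/example"}, "release_ident": "abc123"}}
+
+becomes:
+
+    {"base_url": "...", "ext_ids": {"doi": "10.1234/example"}, "fatcat": {"release_ident": "abc123"}}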
+"""
+
+import sys
+import json
+import argparse
+
+
+def transform(row):
+ """
+ dict-to-dict
+ """
+ row.pop('created', None)
+ extra = row.pop('request', None) or {}
+ for k in ('ext_ids', 'edit_extra'):
+ if k in extra:
+ row[k] = extra[k]
+ if 'release_ident' in extra:
+ row['fatcat'] = dict(release_ident=extra['release_ident'])
+ return row
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ try:
+ req = transform(json.loads(l))
+        except:
+            print(l, file=sys.stderr)
+            continue
+        print(json.dumps(req, sort_keys=True))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('json_file',
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
new file mode 100755
index 0000000..35cee5b
--- /dev/null
+++ b/python/scripts/manifest_converter.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""
+Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of
+"match" JSON objects which can be imported into fatcat with matched_import.py
+
+This was used to convert this manifest:
+
+ https://archive.org/details/ia_papers_manifest_2018-01-25/
+
+to JSON format for fast fatcat importing.
+"""
+
+import sys
+import json
+import sqlite3
+
+# iterate over rows in files metadata...
+# 1. select all identified DOIs
+# => filter based on count
+# 2. select all file metadata
+# 3. output object
+
+def or_none(s):
+ if s is None:
+ return None
+ elif type(s) == str and ((not s) or s == "\\N" or s == "-"):
+ return None
+ return s
+
+def process_db(db_path):
+
+ db = sqlite3.connect(db_path)
+
+ for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"):
+ sha1 = row[0]
+ dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
+ dois = [d[0] for d in dois]
+ if not dois:
+ continue
+ urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
+ if not urls:
+ continue
+ cdx = [dict(url=row[0], dt=row[1]) for row in urls]
+ obj = dict(
+ sha1=sha1,
+ mimetype=or_none(row[1]),
+ size=(or_none(row[2]) and int(row[2])),
+ md5=or_none(row[3]),
+ dois=dois,
+ cdx=cdx,
+ )
+ dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
+ print(json.dumps(obj))
+
+if __name__=="__main__":
+ process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
new file mode 100755
index 0000000..916f41c
--- /dev/null
+++ b/python/scripts/oai2ingestrequest.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+
+"""
+Transform an OAI-PMH bulk dump (JSON) into ingest requests.
+
+Eg: https://archive.org/details/oai_harvest_20200215
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ "://127.0.0.1/",
+
+ # OAI specific additions
+ "://hdl.handle.net/",
+]
+
+RELEASE_STAGE_MAP = {
+ 'info:eu-repo/semantics/draftVersion': 'draft',
+ 'info:eu-repo/semantics/submittedVersion': 'submitted',
+ 'info:eu-repo/semantics/acceptedVersion': 'accepted',
+ 'info:eu-repo/semantics/publishedVersion': 'published',
+ 'info:eu-repo/semantics/updatedVersion': 'updated',
+}
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+def transform(obj):
+ """
+ Transforms from a single OAI-PMH object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ requests = []
+ if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ return []
+ if not obj.get('urls'):
+ return []
+
+ # look in obj['formats'] for PDF?
+ if obj.get('formats'):
+ # if there is a list of formats, and it does not contain PDF, then
+ # skip. Note that we will continue if there is no formats list.
+ has_pdf = False
+ for f in obj['formats']:
+ if 'pdf' in f.lower():
+ has_pdf = True
+ if not has_pdf:
+ return []
+
+ doi = None
+ if obj.get('doi'):
+ doi = obj['doi'][0].lower().strip()
+ if not doi.startswith('10.'):
+ doi = None
+
+ # infer release stage and/or type from obj['types']
+ release_stage = None
+ for t in obj.get('types', []):
+ if t in RELEASE_STAGE_MAP:
+ release_stage = RELEASE_STAGE_MAP[t]
+
+ # TODO: infer rel somehow? Eg, repository vs. OJS publisher
+ rel = None
+
+ for url in obj['urls']:
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in url:
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(url)
+ except UnicodeEncodeError:
+ continue
+
+ request = {
+ 'base_url': base_url,
+ 'ingest_type': 'pdf',
+ 'link_source': 'oai',
+ 'link_source_id': obj['oai'].lower(),
+ 'ingest_request_source': 'metha-bulk',
+ 'release_stage': release_stage,
+ 'rel': rel,
+ 'ext_ids': {
+ 'doi': doi,
+ 'oai': obj['oai'].lower(),
+ },
+ 'edit_extra': {},
+ }
+ requests.append(request)
+
+ return requests
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('json_file',
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
new file mode 100755
index 0000000..af08db6
--- /dev/null
+++ b/python/scripts/pdf_thumbnail.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""
+Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
+
+Originally used to benchmark and compare file size/quality.
+"""
+
+import sys
+import poppler
+from PIL import Image
+
+
+def run(inpath, outpath):
+
+ try:
+ pdf = poppler.load_from_file(inpath)
+ page = pdf.create_page(0)
+ except Exception as e:
+ print(str(e), file=sys.stderr)
+ sys.exit(0)
+
+ renderer = poppler.PageRenderer()
+ full_page = renderer.render_page(page)
+ img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1)
+ img.thumbnail((180,300), Image.BICUBIC)
+ #img.thumbnail((360,600), Image.BICUBIC)
+ img.save(outpath)
+ #img.save(outpath, quality=95)
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
+ sys.exit(-1)
+ run(sys.argv[1], sys.argv[2])
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
new file mode 100755
index 0000000..5536e6c
--- /dev/null
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+"""
+Transform an unpaywall dump (JSON) into ingest requests.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ "semanticscholar.org/",
+ "://doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+]
+
+RELEASE_STAGE_MAP = {
+ 'draftVersion': 'draft',
+ 'submittedVersion': 'submitted',
+ 'acceptedVersion': 'accepted',
+ 'publishedVersion': 'published',
+ 'updatedVersion': 'updated',
+}
+
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
+
+def transform(obj):
+ """
+ Transforms from a single unpaywall object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ requests = []
+ if not obj['doi'].startswith('10.'):
+ return requests
+ if not obj['oa_locations']:
+ return requests
+
+ for location in obj['oa_locations']:
+ if not location['url_for_pdf']:
+ continue
+ skip = False
+ for domain in DOMAIN_BLOCKLIST:
+ if domain in location['url_for_pdf']:
+ skip = True
+ if skip:
+ continue
+ try:
+ base_url = canon(location['url_for_pdf'])
+ except UnicodeEncodeError:
+ continue
+
+ request = {
+ 'base_url': base_url,
+ 'ingest_type': 'pdf',
+ 'link_source': 'unpaywall',
+ 'link_source_id': obj['doi'].lower(),
+ 'ingest_request_source': 'unpaywall',
+ 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
+ 'rel': location['host_type'],
+ 'ext_ids': {
+ 'doi': obj['doi'].lower(),
+ },
+ 'edit_extra': {},
+ }
+ if obj.get('oa_status'):
+ request['edit_extra']['oa_status'] = obj['oa_status']
+ if location.get('evidence'):
+ request['edit_extra']['evidence'] = location['evidence']
+ if location['pmh_id']:
+ request['ext_ids']['pmh_id'] = location['pmh_id']
+ requests.append(request)
+
+ return requests
+
+def run(args):
+ for l in args.json_file:
+ if not l.strip():
+ continue
+ row = json.loads(l)
+
+ requests = transform(row) or []
+ for r in requests:
+ print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('json_file',
+ help="unpaywall dump file to use",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()