diff options
Diffstat (limited to 'python/scripts')
| -rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 72 | ||||
| -rwxr-xr-x | python/scripts/cdx_collection.py | 80 | ||||
| -rwxr-xr-x | python/scripts/covid2ingestrequest.py | 83 | ||||
| -rwxr-xr-x | python/scripts/deliver_dumpgrobid_to_s3.py | 125 | ||||
| -rwxr-xr-x | python/scripts/deliver_gwb_to_disk.py | 166 | ||||
| -rwxr-xr-x | python/scripts/deliver_gwb_to_s3.py | 184 | ||||
| -rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 143 | ||||
| -rwxr-xr-x | python/scripts/enrich_scored_matches.py | 45 | ||||
| -rwxr-xr-x | python/scripts/filter_grobid_metadata.py | 159 | ||||
| -rwxr-xr-x | python/scripts/filter_groupworks.py | 144 | ||||
| -rwxr-xr-x | python/scripts/filter_scored_matches.py | 116 | ||||
| -rwxr-xr-x | python/scripts/grobid_affiliations.py | 52 | ||||
| -rwxr-xr-x | python/scripts/import_grobid_metadata.py | 94 | ||||
| -rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 51 | ||||
| -rwxr-xr-x | python/scripts/manifest_converter.py | 56 | ||||
| -rwxr-xr-x | python/scripts/oai2ingestrequest.py | 137 | ||||
| -rwxr-xr-x | python/scripts/pdf_thumbnail.py | 35 | ||||
| -rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 111 | 
18 files changed, 1853 insertions, 0 deletions
| diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py new file mode 100755 index 0000000..03a1f29 --- /dev/null +++ b/python/scripts/arabesque2ingestrequest.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +""" +This script is intended to be used for backfill ingest of old crawls. It can +also be used as a fast path for getting freshly crawled content into fatcat if +the crawl was a hit and the arabesque JSON was exported conservatively. + +Run like: + +    ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json + +Can then run through requests using that tool, or dump into kafka queue. +""" + +import sys +import json +import argparse + + +def run(args): +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) +        if not row['hit']: +            continue + +        request = { +            'base_url': row['final_url'], +            'ingest_type': args.ingest_type, +            'link_source': args.link_source, +            'link_source_id': row['identifier'], +            'ingest_request_source': args.ingest_request_source, +            'ext_ids': { +                args.extid_type: row['identifier'], +            }, +        } +        if args.release_stage: +            assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update') +            request['release_stage'] = args.release_stage + +        print("{}".format(json.dumps(request, sort_keys=True))) + +def main(): +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('--link-source', +        required=True, +        help="link_source to include in request") +    parser.add_argument('--extid-type', +        required=True, +        help="extid to encode identifier as") +    parser.add_argument('--ingest-type', +        default="pdf", +        help="ingest type (pdf, html, xml, etc)") +    parser.add_argument('--ingest-request-source', +        default="arabesque", +        help="to include in request") +    parser.add_argument('--release-stage', +        default=None, +        help="to include in request") +    parser.add_argument('json_file', +        help="arabesque output file to use", +        type=argparse.FileType('r')) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + +if __name__ == '__main__': +    main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py new file mode 100755 index 0000000..e867b21 --- /dev/null +++ b/python/scripts/cdx_collection.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Fetches and merges all CDX files for a collection. + +Calls metadata API to enumerate all items/files, then fetches and concatanates +them all. Requires the 'internetarchive' library. + +Call with a collection name: + +    ./cdx_collection SOME_COLLECTION_NAME +""" + +import os +import sys +import shutil +import tempfile +import requests +import subprocess +import internetarchive as ia + +def run(): + +    if len(sys.argv) != 2: +        print("Expected a single argument (collection name)") +        sys.exit(-1) + +    collection = sys.argv[1] + +    # Check collection name is clean +    assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum() + +    tempdir = tempfile.mkdtemp() +    print("Looking up collection: {}".format(collection)) + +    # First fetch list +    item_list = list( +        ia.search_items( +            query="collection:{} mediatype:web".format(collection))) + +    if len(item_list) == 0: +        print("No items found, bailing") +        sys.exit(-1) + +    print("Found {} potential items".format(len(item_list))) +    status = True +    errors = [] +    for item in item_list: +        item = item['identifier'] +        # TODO: error handling +        try: +            ret = ia.download(item, files=[item + '.cdx.gz'], +                verbose=True, +                destdir=tempdir, +                no_directory=True, +                retries=1000) +            status = ret and status +        except requests.exceptions.ReadTimeout as rt: +            print(str(rt), file=sys.stderr) +            errors.append(rt) +            continue + +    if errors: +        print("## Download Errors", file=sys.stderr) +        for e in errors: +            print(e, file=sys.stderr) + +    # Combine files +    print("Merging and re-compressing all CDX files...") +    #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), +    subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), +        shell=True) + +    # Move and cleanup +    shutil.move('{}/combined.gz'.format(tempdir), +                '{}.cdx.gz'.format(collection)) + +    print("Done!") + +if __name__=='__main__': +    run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py new file mode 100755 index 0000000..33c425d --- /dev/null +++ b/python/scripts/covid2ingestrequest.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +""" +Transform an unpaywall dump (JSON) into ingest requests. +""" + +import sys +import json +import argparse +import urlcanon + + +def canon(s): +    parsed = urlcanon.parse_url(s) +    return str(urlcanon.whatwg(parsed)) + + +def transform_cnki(obj): + +    requests = [] +    assert obj['cnki_id'] + + +    requests = [] +    requests.append({ +        'base_url': canon(obj['info_url']), +        'ingest_type': 'pdf', +        'link_source': 'cnki_covid19', +        'link_source_id': obj['cnki_id'], +        'ingest_request_source': 'scrape-covid19', +    }) +    if 'read_url' in obj: +        requests.append({ +            'base_url': canon(obj['read_url']), +            'ingest_type': 'pdf',  # actually HTML +            'link_source': 'cnki_covid19', +            'link_source_id': obj['cnki_id'], +            'ingest_request_source': 'scrape-covid19', +        }) + +    return requests + +def transform_wanfang(obj): + +    assert obj['wanfang_id'] +    return [{ +        'base_url': canon(obj['url']), +        'ingest_type': 'pdf', +        'link_source': 'wanfang_covid19', +        'link_source_id': obj['wanfang_id'], +        'ingest_request_source': 'scrape-covid19', +    }] + + +def run(args): +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) + +        if 'wanfang_id' in row: +            requests = transform_wanfang(row) or [] +        elif 'cnki_id' in row: +            requests = transform_cnki(row) or [] +        else: +            continue +        for r in requests: +            print("{}".format(json.dumps(r, sort_keys=True))) + +def main(): +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('json_file', +        help="COVID-19 metadata file to use", +        type=argparse.FileType('r')) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + +if __name__ == '__main__': +    main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py new file mode 100755 index 0000000..86b3b35 --- /dev/null +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +""" +Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump +(from HBase) to AWS S3. + +See unpaywall delivery README (in bnewbold's scratch repo) for notes on running +this script for that specific use-case. + +Script takes: +- input TSV: `sha1_hex, json (including grobid0:tei_xml)` +    => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered +       down (eg, by join by SHA-1) to a specific manifest +- AWS S3 bucket and prefix + +AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + +Output: +- errors/stats to stderr +- log to stdout (redirect to file), prefixed by sha1 + +Requires: +- raven (sentry) +- boto3 (AWS S3 client library) +""" + +import os +import sys +import json +import base64 +import hashlib +import argparse +from collections import Counter + +import boto3 +import raven + +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + + +def b32_hex(s): +    """copy/pasta from elsewhere""" +    s = s.strip().split()[0].lower() +    if s.startswith("sha1:"): +        s = s[5:] +    if len(s) != 32: +        return s +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + + +class DeliverDumpGrobidS3(): + +    def __init__(self, s3_bucket, **kwargs): +        self.rstore = None +        self.count = Counter() +        self.s3_bucket = s3_bucket +        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') +        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') +        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') +        self.s3 = boto3.resource('s3') +        self.bucket = self.s3.Bucket(self.s3_bucket) + +    def run(self, dump_file): +        sys.stderr.write("Starting...\n") +        for line in dump_file: +            line = line.strip().split('\t') +            if len(line) != 2: +                self.count['skip-line'] += 1 +                continue +            sha1_hex, grobid_json = line[0], line[1] +            if len(sha1_hex) != 40: +                sha1_hex = b32_hex(sha1_hex) +            assert len(sha1_hex) == 40 +            grobid = json.loads(grobid_json) +            tei_xml = grobid.get('tei_xml') +            if not tei_xml: +                print("{}\tskip empty".format(sha1_hex)) +                self.count['skip-empty'] += 1 +                continue +            tei_xml = tei_xml.encode('utf-8') +            # upload to AWS S3 +            obj = self.bucket.put_object( +                Key="{}{}/{}{}".format( +                    self.s3_prefix, +                    sha1_hex[0:4], +                    sha1_hex, +                    self.s3_suffix), +                Body=tei_xml, +                StorageClass=self.s3_storage_class, +            ) +            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) +            self.count['success-s3'] += 1 +        sys.stderr.write("{}\n".format(self.count)) + +@sentry_client.capture_exceptions +def main(): + +    parser = argparse.ArgumentParser() +    parser.add_argument('--s3-bucket', +                        required=True, +                        type=str, +                        help='AWS S3 bucket to upload into') +    parser.add_argument('--s3-prefix', +                        type=str, +                        default="grobid/", +                        help='key prefix for items created in bucket') +    parser.add_argument('--s3-suffix', +                        type=str, +                        default=".tei.xml", +                        help='file suffix for created objects') +    parser.add_argument('--s3-storage-class', +                        type=str, +                        default="STANDARD", +                        help='AWS S3 storage class (redundancy) to use') +    parser.add_argument('dump_file', +                        help="TSV/JSON dump file", +                        default=sys.stdin, +                        type=argparse.FileType('r')) +    args = parser.parse_args() + +    worker = DeliverDumpGrobidS3(**args.__dict__) +    worker.run(args.dump_file) + +if __name__ == '__main__': # pragma: no cover +    main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py new file mode 100755 index 0000000..3dcf962 --- /dev/null +++ b/python/scripts/deliver_gwb_to_disk.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Tool for bulk copying of PDFs (or other files) from GWB to local disk. +""" + +# XXX: some broken MRO thing going on in here due to python3 object wrangling +# in `wayback` library. Means we can't run pylint. +# pylint: skip-file + +import os +import sys +import json +import base64 +import hashlib +import argparse +from collections import Counter + +import raven +import wayback.exception +from http.client import IncompleteRead +from wayback.resourcestore import ResourceStore +from gwb.loader import CDXLoaderFactory + +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + + +class DeliverGwbDisk: + +    def __init__(self, disk_dir, **kwargs): +        self.warc_uri_prefix = kwargs.get('warc_uri_prefix') +        self.rstore = None +        self.count = Counter() +        # /serve/ instead of /download/ doesn't record view count +        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') +        # gwb library will fall back to reading from /opt/.petabox/webdata.secret +        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.disk_dir = disk_dir +        self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') +        self.disk_suffix = kwargs.get('disk_suffix', '.pdf') + +    def fetch_warc_content(self, warc_path, offset, c_size): +        warc_uri = self.warc_uri_prefix + warc_path +        if not self.rstore: +            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( +                webdata_secret=self.petabox_webdata_secret, +                download_base_url=self.petabox_base_url)) +        try: +            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) +        except wayback.exception.ResourceUnavailable: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +        except ValueError as ve: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +        except EOFError as eofe: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +        except TypeError as te: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +        # Note: could consider a generic "except Exception" here, as we get so +        # many petabox errors. Do want jobs to fail loud and clear when the +        # whole cluster is down though. + +        if gwb_record.get_status()[0] != 200: +            return None, dict(status="error", +                reason="archived HTTP response (WARC) was not 200", +                warc_status=gwb_record.get_status()[0]) + +        try: +            raw_content = gwb_record.open_raw_content().read() +        except IncompleteRead as ire: +            return None, dict(status="error", +                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +        return raw_content, None + +    def run(self, manifest_file): +        sys.stderr.write("Ensuring all 65536 base directories exist...\n") +        for i in range(256): +            for j in range(256): +                fpath = "{}/{}{:02x}/{:02x}".format( +                        self.disk_dir, +                        self.disk_prefix, +                        i, +                        j) +                os.makedirs(fpath, exist_ok=True) +        sys.stderr.write("Starting...\n") +        for line in manifest_file: +            self.count['total'] += 1 +            line = line.strip().split('\t') +            if len(line) != 2: +                self.count['skip-line'] += 1 +                continue +            sha1_hex, cdx_json = line[0], line[1] +            assert len(sha1_hex) == 40 +            file_cdx = json.loads(cdx_json) +            # If warc is not item/file.(w)arc.gz form, skip it +            if len(file_cdx['warc'].split('/')) != 2: +                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) +                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) +                self.count['skip-warc'] += 1 +                continue +            # fetch from GWB/petabox via HTTP range-request +            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            if blob is None and status: +                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) +                self.count['err-petabox-fetch'] += 1 +                continue +            elif not blob: +                print("{}\tskip-empty-blob".format(sha1_hex)) +                self.count['skip-empty-blob'] += 1 +                continue +            # verify sha1 +            if sha1_hex != hashlib.sha1(blob).hexdigest(): +                #assert sha1_hex == hashlib.sha1(blob).hexdigest() +                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) +                print("{}\terror petabox-hash-mismatch".format(sha1_hex)) +                self.count['err-petabox-hash-mismatch'] += 1 + +            self.count['petabox-ok'] += 1 +            # save to disk +            fpath = "{}/{}{}/{}/{}{}".format( +                    self.disk_dir, +                    self.disk_prefix, +                    sha1_hex[0:2], +                    sha1_hex[2:4], +                    sha1_hex, +                    self.disk_suffix) +            with open(fpath, 'wb') as f: +                f.write(blob) +            print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) +            self.count['success-disk'] += 1 +        sys.stderr.write("{}\n".format(self.count)) + +@sentry_client.capture_exceptions +def main(): + +    parser = argparse.ArgumentParser() +    parser.add_argument('--disk-dir', +                        required=True, +                        type=str, +                        help='local base directory to save into') +    parser.add_argument('--disk-prefix', +                        type=str, +                        default="pdf/", +                        help='directory prefix for items created in bucket') +    parser.add_argument('--disk-suffix', +                        type=str, +                        default=".pdf", +                        help='file suffix for created files') +    parser.add_argument('--warc-uri-prefix', +                        type=str, +                        default='https://archive.org/serve/', +                        help='URI where WARCs can be found') +    parser.add_argument('manifest_file', +                        help="TSV/JSON manifest file", +                        default=sys.stdin, +                        type=argparse.FileType('r')) +    args = parser.parse_args() + +    worker = DeliverGwbDisk(**args.__dict__) +    worker.run(args.manifest_file) + +if __name__ == '__main__': # pragma: no cover +    main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py new file mode 100755 index 0000000..39ac000 --- /dev/null +++ b/python/scripts/deliver_gwb_to_s3.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Tool for bulk copying of PDFs (or other files) from GWB to AWS S3. + +See unpaywall delivery README (in bnewbold's scratch repo) for notes on running +this script for that specific use-case. + +Script takes: +- input TSV: `sha1_hex, file:cdx (json)` +    => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest +- AWS S3 bucket and prefix + +AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + +GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/. + +20x threads on a single machine can process about 340k files in 3 hours; that's +roughly 6 hours per million per host with 32 threads, or 5k files an hour +(1.6/second) per thread. Two large machines should be able to upload 10 million +files in about 30 hours. + +Output: +- errors/stats to stderr +- log to stdout (redirect to file), prefixed by sha1 + +Requires: +- raven (sentry) +- boto3 (AWS S3 client library) +- wayback/GWB libraries +""" + +# XXX: some broken MRO thing going on in here due to python3 object wrangling +# in `wayback` library. Means we can't run pylint. +# pylint: skip-file + +import os +import sys +import json +import base64 +import hashlib +import argparse +from collections import Counter + +import boto3 +import raven +import wayback.exception +from http.client import IncompleteRead +from wayback.resourcestore import ResourceStore +from gwb.loader import CDXLoaderFactory + +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + + +class DeliverGwbS3: + +    def __init__(self, s3_bucket, **kwargs): +        self.warc_uri_prefix = kwargs.get('warc_uri_prefix') +        self.rstore = None +        self.count = Counter() +        # /serve/ instead of /download/ doesn't record view count +        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') +        # gwb library will fall back to reading from /opt/.petabox/webdata.secret +        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.s3_bucket = s3_bucket +        self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') +        self.s3_suffix = kwargs.get('s3_suffix', '.pdf') +        self.s3 = boto3.resource('s3') +        self.bucket = self.s3.Bucket(self.s3_bucket) + +    def fetch_warc_content(self, warc_path, offset, c_size): +        warc_uri = self.warc_uri_prefix + warc_path +        if not self.rstore: +            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( +                webdata_secret=self.petabox_webdata_secret, +                download_base_url=self.petabox_base_url)) +        try: +            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) +        except wayback.exception.ResourceUnavailable: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +        except ValueError as ve: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +        except EOFError as eofe: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +        except TypeError as te: +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +        # Note: could consider a generic "except Exception" here, as we get so +        # many petabox errors. Do want jobs to fail loud and clear when the +        # whole cluster is down though. + +        if gwb_record.get_status()[0] != 200: +            return None, dict(status="error", +                reason="archived HTTP response (WARC) was not 200", +                warc_status=gwb_record.get_status()[0]) + +        try: +            raw_content = gwb_record.open_raw_content().read() +        except IncompleteRead as ire: +            return None, dict(status="error", +                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +        return raw_content, None + +    def run(self, manifest_file): +        sys.stderr.write("Starting...\n") +        for line in manifest_file: +            self.count['total'] += 1 +            line = line.strip().split('\t') +            if len(line) != 2: +                self.count['skip-line'] += 1 +                continue +            sha1_hex, cdx_json = line[0], line[1] +            assert len(sha1_hex) == 40 +            file_cdx = json.loads(cdx_json) +            # If warc is not item/file.(w)arc.gz form, skip it +            if len(file_cdx['warc'].split('/')) != 2: +                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) +                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) +                self.count['skip-warc'] += 1 +                continue +            # fetch from GWB/petabox via HTTP range-request +            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            if blob is None and status: +                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) +                self.count['err-petabox-fetch'] += 1 +                continue +            elif not blob: +                print("{}\tskip-empty-blob".format(sha1_hex)) +                self.count['skip-empty-blob'] += 1 +                continue +            # verify sha1 +            if sha1_hex != hashlib.sha1(blob).hexdigest(): +                #assert sha1_hex == hashlib.sha1(blob).hexdigest() +                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) +                print("{}\terror petabox-hash-mismatch".format(sha1_hex)) +                self.count['err-petabox-hash-mismatch'] += 1 + +            self.count['petabox-ok'] += 1 +            # upload to AWS S3 +            obj = self.bucket.put_object( +                Key="{}{}/{}{}".format( +                    self.s3_prefix, +                    sha1_hex[0:4], +                    sha1_hex, +                    self.s3_suffix), +                Body=blob) +            print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) +            self.count['success-s3'] += 1 +        sys.stderr.write("{}\n".format(self.count)) + +@sentry_client.capture_exceptions +def main(): + +    parser = argparse.ArgumentParser() +    parser.add_argument('--s3-bucket', +                        required=True, +                        type=str, +                        help='AWS S3 bucket to upload into') +    parser.add_argument('--s3-prefix', +                        type=str, +                        default="pdf/", +                        help='key prefix for items created in bucket') +    parser.add_argument('--s3-suffix', +                        type=str, +                        default=".pdf", +                        help='file suffix for created objects') +    parser.add_argument('--warc-uri-prefix', +                        type=str, +                        default='https://archive.org/serve/', +                        help='URI where WARCs can be found') +    parser.add_argument('manifest_file', +                        help="TSV/JSON manifest file", +                        default=sys.stdin, +                        type=argparse.FileType('r')) +    args = parser.parse_args() + +    worker = DeliverGwbS3(**args.__dict__) +    worker.run(args.manifest_file) + +if __name__ == '__main__': # pragma: no cover +    main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py new file mode 100755 index 0000000..a7214d0 --- /dev/null +++ b/python/scripts/doaj2ingestrequest.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 + +""" +Transform an DOAJ article dump (JSON) into ingest requests. + +TODO: should we also attempt PDF ingest for HTML links? They seem to often be +landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url` +in the HTML headers and adds an ingest request on that basis. Or even just run +the re-ingest in-process and publish a second result. +""" + +import sys +import json +import argparse +import urlcanon +from typing import Optional, List + +DOMAIN_BLOCKLIST = [ +    # large OA publishers (we get via DOI) + +    # large repos and aggregators (we crawl directly) +    "://arxiv.org/", +    "://europepmc.org/", +    "ncbi.nlm.nih.gov/", +    #"semanticscholar.org/", +    "://doi.org/", +    "zenodo.org/", +    "figshare.com/", +    "://archive.org/", +    ".archive.org/", + +    # large publishers/platforms; may remove in the future +    #"://link.springer.com/", +    #"://dergipark.gov.tr/", +    #"frontiersin.org/", +    #"scielo", +] + +# these default to PDF; note that we also do pdf ingests for HTML pages +CONTENT_TYPE_MAP = { +    "abstract": [], +    "doc": [], +    "": ["pdf"], + +    "doi": ["pdf"], +    "url": ["pdf"], +    "fulltext": ["pdf"], +    "anySimpleType": ["pdf"], + +    "application/pdf": ["pdf"], +    "html": ["html", "pdf"], +    "text/html": ["html", "pdf"], +    "xml": ["xml"], +} + +def canon(s: str) -> str: +    parsed = urlcanon.parse_url(s) +    return str(urlcanon.whatwg(parsed)) + +def transform(obj: dict) -> List[dict]: +    """ +    Transforms from a single DOAJ object to zero or more ingest requests. +    Returns a list of dicts. +    """ + +    doaj_id = obj['id'].lower() +    assert doaj_id + +    bibjson = obj['bibjson'] +    if not bibjson['link']: +        return [] + +    requests = [] + +    doi: Optional[str] = None +    for ident in (bibjson['identifier'] or []): +        if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'): +            doi = ident['id'].lower() + +    for link in (bibjson['link'] or []): +        if link.get('type') != "fulltext" or not link.get('url'): +            continue +        ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower()) +        if not ingest_types: +            continue + +        skip = False +        for domain in DOMAIN_BLOCKLIST: +            if domain in link['url'].lower(): +                skip = True +        if skip: +            continue +        try: +            base_url = canon(link['url'].strip()) +        except UnicodeEncodeError: +            continue + +        if not base_url or len(base_url) > 1000: +            continue + +        for ingest_type in ingest_types: +            request = { +                'base_url': base_url, +                'ingest_type': ingest_type, +                'link_source': 'doaj', +                'link_source_id': doaj_id, +                'ingest_request_source': 'doaj', +                'release_stage': 'published', +                'rel': 'publisher', +                'ext_ids': { +                    'doi': doi, +                    'doaj': doaj_id, +                }, +                'edit_extra': {}, +            } +            requests.append(request) + +    return requests + +def run(args) -> None: +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) + +        requests = transform(row) or [] +        for r in requests: +            print("{}".format(json.dumps(r, sort_keys=True))) + +def main() -> None: +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('json_file', +        help="DOAJ article dump file to use", +        type=argparse.FileType('r')) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + +if __name__ == '__main__': +    main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py new file mode 100755 index 0000000..9fe1499 --- /dev/null +++ b/python/scripts/enrich_scored_matches.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +""" +Takes an "joined" TSV input stream: + +- sha1 +- dois (JSON list) +- cdx (JSON object) +    - url +    - dt +    (etc) +- mimetype +- size (integer) + +And outputs JSON objects that are can be imported into fatcat with the +"matched" script. + +No dependencies (only python3 stdlib) +""" + +import sys +import json +import base64 + +def run(): +    for line in sys.stdin: +        line = line.split('\t') +        assert len(line) == 5 +        raw_sha1 = line[0].replace('sha1:', '') +        dois = json.loads(line[1]) +        cdx = json.loads(line[2]) +        mimetype = line[3] +        size = int(line[4]) + +        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() + +        obj = dict( +            sha1=sha1, +            dois=dois, +            cdx=[dict(url=cdx['url'], dt=cdx['dt'])], +            size=size, +            mimetype=mimetype) +        print(json.dumps(obj)) + +if __name__=='__main__': +    run() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py new file mode 100755 index 0000000..dc4bea7 --- /dev/null +++ b/python/scripts/filter_grobid_metadata.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +import sys +import json + +with open('title_slug_denylist.txt', 'r') as f: +    TITLE_DENYLIST = [l.strip() for l in f] + +TITLE_DENYLIST.extend(( +    'editorial', +    'advertisement', +    'bookreviews', +    'reviews', +    'nr', +    'abstractoriginalarticle', +    'originalarticle', +    'impactfactor', +    'articlenumber', +)) + +# The full name can't *entirely* be one of these +NAME_DENYLIST = ( +    'phd', +    'phdstudent', +) + +def tokenize(s, remove_whitespace=True): + +    s.replace(''', "'") +    # Remove non-alphanumeric characters +    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) + +    if remove_whitespace: +        s = ''.join(s.split()) + +    # Encode as dumb ASCII (TODO: this is horrible) +    return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + +assert tokenize("Impact Factor: 2.114") == "impactfactor" +assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST + +def filter_title(title): + +    title = title.strip() +    if len(title) > 500: +        return None +    title_slug = tokenize(title, remove_whitespace=True) +    if len(title_slug) < 10 or title_slug in TITLE_DENYLIST: +        return None +    if title_slug.startswith('nr'): +        return None +    if title.lower().replace('.', '').startswith('int j '): +        return None + +    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "): +        if title.startswith(prefix): +            title.replace(prefix, '') + +    if title.startswith("The Journal of "): +        return None + +    if "volume" in title_slug and "issue" in title_slug: +        return None + +    if "downloadedfrom" in title_slug: +        return None + +    if title_slug.startswith("issn"): +        return None + +    # titles with too many or too few words in title +    title_words = len(title.split()) +    if title_words > 50 or title_words < 2: +        return None + +    # titles with spaces between every letter (more than N such single-char words) +    if len([True for w in title.split() if len(w) == 1]) > 12: +        return None + +    # too deep subtitling/splitting +    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: +        return None + +    return title + +def filter_author_name(name): +    name = name['name'] +    if name.strip().lower().replace(' ', '') in NAME_DENYLIST: +        return None +    return ' '.join([t for t in name.split() if tokenize(t)]) + +def filter_authors(l): +    return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] + +def filter_refs(l): +    # TODO: +    return l + +def filter_journal_name(name): +    # same denylist, for now +    if not name: +        return None +    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') +    slug_name = tokenize(name) +    if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": +        return None +    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): +        if name.startswith(prefix): +            name = name.replace(prefix, '') +    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): +        if name.endswith(suffix): +            name = name.replace(suffix, '') +    if "====================" in name: +        return None +    if len(name) > 150: +        return None +    return ' '.join(name.split()) + +def filter_metadata(obj): +    if not (obj.get('title') and obj.get('authors')): +        return None + +    title = filter_title(obj['title']) +    if not title: +        #sys.stderr.write("bad title\n") +        return None +    else: +        obj['title'] = title +    obj['authors'] = filter_authors(obj['authors']) +    obj['citations'] = filter_refs(obj['citations']) +    obj['journal']['name'] = filter_journal_name(obj['journal']['name']) + +    return obj + +def run(invert=False): +    for line in sys.stdin: +        fields = line.split('\t') +        if len(fields) == 5: +            raw = fields[4] +        elif len(fields) == 1: +            raw = fields[0] +        else: +            sys.stderr.write("bad line\n") +            continue +        obj = json.loads(raw) +        processed = filter_metadata(obj) +        if processed: +            if not invert: +                processed = json.dumps(processed) +                if len(fields) == 5: +                    fields[4] = processed +                else: +                    fields[0] = processed +                print('\t'.join(fields)) +        elif invert: +            print(raw.strip()) + +if __name__=="__main__": +    run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py new file mode 100755 index 0000000..bbba770 --- /dev/null +++ b/python/scripts/filter_groupworks.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Filters an input stream of sorted "groupworks" scalding job, and outputs +"good enough" matches to be merged in fatcat. + +Output is JSON lines which are arrays of releases that could/should be merged +together, either as multiple releases under a single work, or releases merged +into a single entity (via redirects). + +Note that releases *should* only end up on a single line, and only once per +line! + +No dependencies (only python3 stdlib) + +Note: the actual importer/merger should filter the following patterns out: +- container title has "letter" and "diar" +- contribs (authors) contain "&NA;" +- dates differ (not just year) +""" + +import sys +import json + +# out of 1000 +SCORE_THRESHOLD = 900 + +MAX_SLUG_LINES = 50 + +REQUIRE_AUTHORS = False + +def tokenize(s, remove_whitespace=False): + +    s.replace(''', "'") +    # Remove non-alphanumeric characters +    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + +    if remove_whitespace: +        s = ''.join(s.split()) + +    # Encode as dumb ASCII (TODO: this is horrible) +    return s.encode('ascii', 'replace').replace(b'?', b'') + +def check_authors(left, right): +    """ +    Intended to check GROBID extracted authors (right) against "known good" +    (but maybe not perfect) Crossref metadata authors ("left"). +    """ +    if not left and not right: +        return bool(not REQUIRE_AUTHORS) +    if len(left) != len(right): +        return False +    right_all = tokenize(" ".join(right)) +    for i in range(len(left)): +        l = left[i].lower().replace('jr.', '').split() +        if not l: +            return False +        l = tokenize(l[-1]) +        if len(l) <= 1: +            # weird author name (single char) +            return False +        if l not in right_all: +            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) +            return False +    return True + +def test_check_authors(): +    assert check_authors([], []) == bool(not REQUIRE_AUTHORS) +    assert not check_authors([], ['one']) +    assert check_authors(['one'], ['one']) +    assert check_authors(['one two'], ['One Two']) +    assert check_authors(['two'], ['One Two']) +    assert check_authors(['two'], ['two, one']) +    assert check_authors(['mago'], ['Mr. Magoo']) +    assert check_authors(['Mr. Magoo'], ['Mr Magoo']) +    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + +# Rows are (score, left, right) +def process_group(rows): + +    # first pass reduces size of list and generates a linkage graph +    filtered = list() +    for row in rows: +        score = int(row[0]) +        if score < SCORE_THRESHOLD: +            continue +        left = json.loads(row[1]) +        right = json.loads(row[2]) +        # authors must roughly match +        if not check_authors(left['authors'], right['authors']): +            continue +        # years must match (if defined) +        if left['year'] and right['year'] and left['year'] != right['year']: +            continue +        filtered.append((left, right)) + +    if not filtered: +        return + +    # second pass finds a connected graph and returns that +    releases = dict() +    group_ids = set() +    for row in filtered[1:]: +        (left, right) = row +        l_id = left['fatcat_release'] +        r_id = right['fatcat_release'] +        releases[l_id] = left +        releases[r_id] = right +        if not group_ids: +            group_ids.add(l_id) +            group_ids.add(r_id) +            continue +        if l_id in group_ids or r_id in group_ids: +            group_ids.add(l_id) +            group_ids.add(r_id) +            continue + +    if not group_ids: +        return + +    print(json.dumps([releases[ident] for ident in group_ids])) + +def run(): + +    last_slug = None +    lines = [] + +    # group lines by slug, and process in batches +    for line in sys.stdin: +        line = line.strip().split('\t') +        assert len(line) == 4 +        slug = line[0] +        if last_slug and slug != last_slug and lines: +            if len(lines) <= MAX_SLUG_LINES: +                process_group(lines) +            lines = [] +        last_slug = slug +        lines.append(line[1:]) + +    # catch any remaining +    if lines: +        process_group(lines) + +if __name__=='__main__': +    run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py new file mode 100755 index 0000000..3654b87 --- /dev/null +++ b/python/scripts/filter_scored_matches.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Filters an input stream of sorted "matchcrossref" scalding job, and outputs +"good enough" matches to be inserted to fatcat. + +Currently works on DOI numbers. Filters for a high enough string match (doesn't +re-do title match), and checks author lists. Filters out slugs with too many +matches, and outputs one-line-per-sha1 (aka, file). + +No dependencies (only python3 stdlib) +""" + +import sys +import json + +# out of 1000 +score_threshold = 900 + +max_slug_lines = 10 + +require_authors = 1 + + +def tokenize(s, remove_whitespace=False): + +    s.replace(''', "'") +    # Remove non-alphanumeric characters +    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + +    if remove_whitespace: +        s = ''.join(s.split()) + +    # Encode as dumb ASCII (TODO: this is horrible) +    return s.encode('ascii', 'replace').replace(b'?', b'') + +def check_authors(left, right): +    """ +    Intended to check GROBID extracted authors (right) against "known good" +    (but maybe not perfect) Crossref metadata authors ("left"). +    """ +    if not left: +        return False +    if len(left) > len(right): +        return False +    right_all = tokenize(" ".join(right)) +    for i in range(len(left)): +        l = left[i].lower().replace('jr.', '').split() +        if not l: +            return False +        l = tokenize(l[-1]) +        if len(l) <= 1: +            # weird author name (single char) +            return False +        if l not in right_all: +            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) +            return False +    return True + +def test_check_authors(): +    assert not check_authors([], []) +    assert not check_authors([], ['one']) +    assert check_authors(['one'], ['one']) +    assert check_authors(['one two'], ['One Two']) +    assert check_authors(['two'], ['One Two']) +    assert check_authors(['two'], ['two, one']) +    assert check_authors(['mago'], ['Mr. Magoo']) +    assert check_authors(['Mr. Magoo'], ['Mr Magoo']) +    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + +# Rows are (score, grobid, crossref) +def process_group(rows): +    if len(rows) > max_slug_lines: +        return +    keepers = dict() +    for row in rows: +        score = int(row[0]) +        if score < score_threshold: +            continue +        grobid = json.loads(row[1]) +        crossref = json.loads(row[2]) +        if not check_authors(crossref['authors'], grobid['authors']): +            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) +            continue +        else: +            #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) +            pass +        sha1 = grobid['sha1'] +        doi = crossref['doi'].lower() +        l = keepers.get(sha1, list()) +        l.append(doi) +        keepers[sha1] = l +    for sha1, doi_list in keepers.items(): +        print("{}\t{}".format(sha1, json.dumps(doi_list))) + +def run(): + +    last_slug = None +    lines = [] + +    # group lines by slug, and process in batches +    for line in sys.stdin: +        line = line.strip().split('\t') +        assert len(line) == 4 +        slug = line[0] +        if last_slug and slug != last_slug and lines: +            process_group(lines) +            lines = [] +        last_slug = slug +        lines.append(line[1:]) + +    # catch any remaining +    if lines: +        process_group(lines) + +if __name__=='__main__': +    run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py new file mode 100755 index 0000000..79feac1 --- /dev/null +++ b/python/scripts/grobid_affiliations.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +""" +Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction +output, converts the XML to JSON, filters out raw affiliation strings, and +dumps these as JSON subset. + +Run in bulk like: + +    ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' +""" + +import sys +import json + +from grobid2json import teixml2json + +def parse_hbase(line): +    line = line.split('\t') +    assert len(line) == 2 +    sha1hex = line[0] +    obj = json.loads(line[1]) +    tei_xml = obj['tei_xml'] +    return sha1hex, tei_xml + +def parse_pg(line): +    obj = json.loads(line) +    return obj['sha1hex'], obj['tei_xml'] + +def run(mode='hbase'): +    for line in sys.stdin: +        if mode == 'hbase': +            sha1hex, tei_xml = parse_hbase(line) +        elif mode == 'pg': +            sha1hex, tei_xml = parse_pg(line) +        else: +            raise NotImplementedError('parse mode: {}'.format(mode)) + +        obj = teixml2json(tei_xml, encumbered=False) + +        affiliations = [] +        for author in obj['authors']: +            if author.get('affiliation'): +                affiliations.append(author['affiliation']) +        if affiliations: +            # don't duplicate affiliations; only the unique ones +            affiliations = list(set([json.dumps(a) for a in affiliations])) +            affiliations = [json.loads(a) for a in affiliations] +            print('\t'.join([sha1hex, json.dumps(affiliations)])) + +if __name__=='__main__': +    run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py new file mode 100755 index 0000000..d01b526 --- /dev/null +++ b/python/scripts/import_grobid_metadata.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +import sys +import json +import datetime + +MAX_ABSTRACT_BYTES=4096 + +def parse_grobid_json(obj): + +    if not obj.get('title'): +        return None + +    extra = dict() + +    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: +        abobj = dict( +            mimetype="text/plain", +            language=None, +            content=obj.get('abstract').strip()) +        abstracts = [abobj] +    else: +        abstracts = None + +    contribs = [] +    for a in obj.get('authors', []): +        c = dict(raw_name=a, role="author") +        contribs.append(c) + +    refs = [] +    for raw in obj.get('citations', []): +        extra = dict() +        ref = dict() +        ref['key'] = raw.get('id') +        if raw.get('title'): +            ref['title'] = raw['title'].strip() +        if raw.get('date'): +            try: +                year = int(raw['date'].strip()[:4]) +                ref['year'] = year +            except: +                pass +        for key in ('volume', 'url', 'issue', 'publisher'): +            if raw.get(key): +                extra[key] = raw[key].strip() +        if raw.get('authors'): +            extra['authors'] = [a['name'] for a in raw['authors']] +        if extra: +            extra = dict(grobid=extra) +        else: +            extra = None +        ref['extra'] = extra +        refs.append(ref) + +    release_type = "journal-article" +    release_date = None +    if obj.get('date'): +        # TODO: only returns year, ever? how to handle? +        release_date = datetime.datetime(year=obj['date'], month=1, day=1) + +    if obj.get('doi'): +        extra['doi'] = obj['doi'].lower() +    if obj['journal'].get('name'): +        extra['container_name'] = obj['journal']['name'] + +    extra['is_longtail_oa'] = True + +    # TODO: ISSN/eISSN handling? or just journal name lookup? + +    if extra: +        extra = dict(grobid=extra) +    else: +        extra = None + +    return dict( +        title=obj['title'].strip(), +        contribs=contribs, +        publisher=obj['journal'].get('publisher'), +        volume=obj['journal'].get('volume'), +        issue=obj['journal'].get('issue'), +        abstracts=abstracts, +        release_type=release_type, +        release_date=release_date, +        extra=extra) + +def run(): +    for line in sys.stdin: +        obj = json.loads(line) +        out = parse_grobid_json(obj) +        if out: +            print(out) + +if __name__=="__main__": +    run() diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py new file mode 100755 index 0000000..494ec7a --- /dev/null +++ b/python/scripts/ingestrequest_row2json.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +""" +This script is used to turn ingest request postgres rows (in JSON export +format) back in to regular ingest request JSON. + +The only difference is the name and location of some optional keys. +""" + +import sys +import json +import argparse + + +def transform(row): +    """ +    dict-to-dict +    """ +    row.pop('created', None) +    extra = row.pop('request', None) or {} +    for k in ('ext_ids', 'edit_extra'): +        if k in extra: +            row[k] = extra[k] +    if 'release_ident' in extra: +        row['fatcat'] = dict(release_ident=extra['release_ident']) +    return row + +def run(args): +    for l in args.json_file: +        if not l.strip(): +            continue +        try: +            req = transform(json.loads(l)) +        except: +            print(l, file=sys.stderr) +        print(json.dumps(req, sort_keys=True)) + +def main(): +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('json_file', +        help="arabesque output file to use", +        type=argparse.FileType('r')) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + +if __name__ == '__main__': +    main() diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py new file mode 100755 index 0000000..35cee5b --- /dev/null +++ b/python/scripts/manifest_converter.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +""" +Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of +"match" JSON objects which can be imported into fatcat with matched_import.py + +This was used to convert this manifest: + +    https://archive.org/details/ia_papers_manifest_2018-01-25/ + +to JSON format for fast fatcat importing. +""" + +import sys +import json +import sqlite3 + +# iterate over rows in files metadata... +# 1. select all identified DOIs +#   => filter based on count +# 2. select all file metadata +# 3. output object + +def or_none(s): +    if s is None: +        return None +    elif type(s) == str and ((not s) or s == "\\N" or s == "-"): +        return None +    return s + +def process_db(db_path): + +    db = sqlite3.connect(db_path) + +    for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"): +        sha1 = row[0] +        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall() +        dois = [d[0] for d in dois] +        if not dois: +            continue +        urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall() +        if not urls: +            continue +        cdx = [dict(url=row[0], dt=row[1]) for row in urls] +        obj = dict( +            sha1=sha1, +            mimetype=or_none(row[1]), +            size=(or_none(row[2]) and int(row[2])), +            md5=or_none(row[3]), +            dois=dois, +            cdx=cdx, +        ) +        dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) +        print(json.dumps(obj)) + +if __name__=="__main__": +    process_db(sys.argv[1]) diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py new file mode 100755 index 0000000..916f41c --- /dev/null +++ b/python/scripts/oai2ingestrequest.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 + +""" +Transform an OAI-PMH bulk dump (JSON) into ingest requests. + +Eg: https://archive.org/details/oai_harvest_20200215 +""" + +import sys +import json +import argparse +import urlcanon + +DOMAIN_BLOCKLIST = [ +    # large OA publishers (we get via DOI) + +    # large repos and aggregators (we crawl directly) +    "://arxiv.org/", +    "://europepmc.org/", +    "ncbi.nlm.nih.gov/", +    "semanticscholar.org/", +    "://doi.org/", +    "://dx.doi.org/", +    "zenodo.org/", +    "figshare.com/", +    "://archive.org/", +    ".archive.org/", +    "://127.0.0.1/", + +    # OAI specific additions +    "://hdl.handle.net/", +] + +RELEASE_STAGE_MAP = { +    'info:eu-repo/semantics/draftVersion':     'draft', +    'info:eu-repo/semantics/submittedVersion': 'submitted', +    'info:eu-repo/semantics/acceptedVersion':  'accepted', +    'info:eu-repo/semantics/publishedVersion': 'published', +    'info:eu-repo/semantics/updatedVersion':   'updated', +} + +def canon(s): +    parsed = urlcanon.parse_url(s) +    return str(urlcanon.whatwg(parsed)) + +def transform(obj): +    """ +    Transforms from a single OAI-PMH object to zero or more ingest requests. +    Returns a list of dicts. +    """ + +    requests = [] +    if not obj.get('oai') or not obj['oai'].startswith('oai:'): +        return [] +    if not obj.get('urls'): +        return [] + +    # look in obj['formats'] for PDF? +    if obj.get('formats'): +        # if there is a list of formats, and it does not contain PDF, then +        # skip. Note that we will continue if there is no formats list. +        has_pdf = False +        for f in obj['formats']: +            if 'pdf' in f.lower(): +                has_pdf = True +        if not has_pdf: +            return [] + +    doi = None +    if obj.get('doi'): +        doi = obj['doi'][0].lower().strip() +        if not doi.startswith('10.'): +            doi = None + +    # infer release stage and/or type from obj['types'] +    release_stage = None +    for t in obj.get('types', []): +        if t in RELEASE_STAGE_MAP: +            release_stage = RELEASE_STAGE_MAP[t] + +    # TODO: infer rel somehow? Eg, repository vs. OJS publisher +    rel = None + +    for url in obj['urls']: +        skip = False +        for domain in DOMAIN_BLOCKLIST: +            if domain in url: +                skip = True +        if skip: +            continue +        try: +            base_url = canon(url) +        except UnicodeEncodeError: +            continue + +        request = { +            'base_url': base_url, +            'ingest_type': 'pdf', +            'link_source': 'oai', +            'link_source_id': obj['oai'].lower(), +            'ingest_request_source': 'metha-bulk', +            'release_stage': release_stage, +            'rel': rel, +            'ext_ids': { +                'doi': doi, +                'oai': obj['oai'].lower(), +            }, +            'edit_extra': {}, +        } +        requests.append(request) + +    return requests + +def run(args): +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) + +        requests = transform(row) or [] +        for r in requests: +            print("{}".format(json.dumps(r, sort_keys=True))) + +def main(): +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('json_file', +        help="OAI-PMH dump file to use (usually stdin)", +        type=argparse.FileType('r')) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + +if __name__ == '__main__': +    main() diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py new file mode 100755 index 0000000..af08db6 --- /dev/null +++ b/python/scripts/pdf_thumbnail.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +""" +Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). + +Originally used to benchmark and compare file size/quality. +""" + +import sys +import poppler +from PIL import Image + + +def run(inpath, outpath): + +    try: +        pdf = poppler.load_from_file(inpath) +        page = pdf.create_page(0) +    except Exception as e: +        print(str(e), file=sys.stderr) +        sys.exit(0) + +    renderer = poppler.PageRenderer() +    full_page = renderer.render_page(page) +    img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1) +    img.thumbnail((180,300), Image.BICUBIC) +    #img.thumbnail((360,600), Image.BICUBIC) +    img.save(outpath) +    #img.save(outpath, quality=95) + +if __name__ == '__main__': +    if len(sys.argv) != 3: +        print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) +        sys.exit(-1) +    run(sys.argv[1], sys.argv[2]) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py new file mode 100755 index 0000000..5536e6c --- /dev/null +++ b/python/scripts/unpaywall2ingestrequest.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +""" +Transform an unpaywall dump (JSON) into ingest requests. +""" + +import sys +import json +import argparse +import urlcanon + +DOMAIN_BLOCKLIST = [ +    # large OA publishers (we get via DOI) + +    # large repos and aggregators (we crawl directly) +    "://arxiv.org/", +    "://europepmc.org/", +    "ncbi.nlm.nih.gov/", +    "semanticscholar.org/", +    "://doi.org/", +    "zenodo.org/", +    "figshare.com/", +    "://archive.org/", +    ".archive.org/", +] + +RELEASE_STAGE_MAP = { +    'draftVersion':     'draft', +    'submittedVersion': 'submitted', +    'acceptedVersion':  'accepted', +    'publishedVersion': 'published', +    'updatedVersion':   'updated', +} + +def canon(s): +    parsed = urlcanon.parse_url(s) +    return str(urlcanon.whatwg(parsed)) + +def transform(obj): +    """ +    Transforms from a single unpaywall object to zero or more ingest requests. +    Returns a list of dicts. +    """ + +    requests = [] +    if not obj['doi'].startswith('10.'): +        return requests +    if not obj['oa_locations']: +        return requests + +    for location in obj['oa_locations']: +        if not location['url_for_pdf']: +            continue +        skip = False +        for domain in DOMAIN_BLOCKLIST: +            if domain in location['url_for_pdf']: +                skip = True +        if skip: +            continue +        try: +            base_url = canon(location['url_for_pdf']) +        except UnicodeEncodeError: +            continue + +        request = { +            'base_url': base_url, +            'ingest_type': 'pdf', +            'link_source': 'unpaywall', +            'link_source_id': obj['doi'].lower(), +            'ingest_request_source': 'unpaywall', +            'release_stage': RELEASE_STAGE_MAP.get(location['version']), +            'rel': location['host_type'], +            'ext_ids': { +                'doi': obj['doi'].lower(), +            }, +            'edit_extra': {}, +        } +        if obj.get('oa_status'): +            request['edit_extra']['oa_status'] = obj['oa_status'] +        if location.get('evidence'): +            request['edit_extra']['evidence'] = location['evidence'] +        if location['pmh_id']: +            request['ext_ids']['pmh_id'] = location['pmh_id'] +        requests.append(request) + +    return requests + +def run(args): +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) + +        requests = transform(row) or [] +        for r in requests: +            print("{}".format(json.dumps(r, sort_keys=True))) + +def main(): +    parser = argparse.ArgumentParser( +        formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('json_file', +        help="unpaywall dump file to use", +        type=argparse.FileType('r')) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + +if __name__ == '__main__': +    main() | 
