Diffstat (limited to 'python/scripts')
20 files changed, 1222 insertions, 526 deletions
| diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 03a1f29..4561541 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  This script is intended to be used for backfill ingest of old crawls. It can  also be used as a fast path for getting freshly crawled content into fatcat if @@ -12,9 +11,9 @@ Run like:  Can then run through requests using that tool, or dump into kafka queue.  """ -import sys -import json  import argparse +import json +import sys  def run(args): @@ -22,51 +21,54 @@ def run(args):          if not l.strip():              continue          row = json.loads(l) -        if not row['hit']: +        if not row["hit"]:              continue          request = { -            'base_url': row['final_url'], -            'ingest_type': args.ingest_type, -            'link_source': args.link_source, -            'link_source_id': row['identifier'], -            'ingest_request_source': args.ingest_request_source, -            'ext_ids': { -                args.extid_type: row['identifier'], +            "base_url": row["final_url"], +            "ingest_type": args.ingest_type, +            "link_source": args.link_source, +            "link_source_id": row["identifier"], +            "ingest_request_source": args.ingest_request_source, +            "ext_ids": { +                args.extid_type: row["identifier"],              },          }          if args.release_stage: -            assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update') -            request['release_stage'] = args.release_stage +            assert args.release_stage in ( +                "published", +                "submitted", +                "accepted", +                "draft", +                "update", +            ) +            request["release_stage"] = args.release_stage          print("{}".format(json.dumps(request, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('--link-source', -        required=True, -        help="link_source to include in request") -    parser.add_argument('--extid-type', -        required=True, -        help="extid to encode identifier as") -    parser.add_argument('--ingest-type', -        default="pdf", -        help="ingest type (pdf, html, xml, etc)") -    parser.add_argument('--ingest-request-source', -        default="arabesque", -        help="to include in request") -    parser.add_argument('--release-stage', -        default=None, -        help="to include in request") -    parser.add_argument('json_file', -        help="arabesque output file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "--link-source", required=True, help="link_source to include in request" +    ) +    parser.add_argument("--extid-type", required=True, help="extid to encode identifier as") +    parser.add_argument( +        "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)" +    ) +    parser.add_argument( +        "--ingest-request-source", default="arabesque", help="to include in request" +    ) +    parser.add_argument("--release-stage", default=None, help="to include in request") +    parser.add_argument( +        "json_file", help="arabesque output file to 
use", type=argparse.FileType("r") +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py new file mode 100755 index 0000000..6328f52 --- /dev/null +++ b/python/scripts/archiveorg_fileset.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Helper script to  + +Takes either two args (release ident and archive.org item), or a stream of +tab-separated such pairs on stdin. + +TODO: +- should this check the item type? +""" + +import json +import sys +from typing import Any + +import internetarchive + +FORMAT_TO_MIMETYPE = { +    "BZIP": "application/x-bzip", +    "BZIP2": "application/x-bzip2", +    "ZIP": "application/zip", +    "GZIP": "application/gzip", +    "RAR": "application/vnd.rar", +    "TAR": "application/x-tar", +    "7z": "application/x-7z-compressed", +    "HTML": "text/html", +    "Text": "text/plain", +    "PDF": "application/pdf", +    "CSV": "text/csv", +    "XML": "application/xml", +    "JSON": "application/json", +    #'application/msword (.doc)', # .doc +    #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx +    #'application/vnd.ms-excel', # .xls +    #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx +    "MP3": "audio/mpeg",  # .mp3 +    "MP4": "video/mp4",  # .mp4 +    "MPEG": "video/mpeg",  # .mpeg +    "JPEG": "image/jpeg", +    "GIF": "image/gif", +    "PNG": "image/png", +    "TIFF": "image/tiff", +    "Unknown": None, +} + + +def want_file(f: dict, item_name: str) -> bool: +    """ +    Filters IA API files +    """ +    if f.source != "original": +        return False +    for suffix in [ +        "_meta.sqlite", +        "_archive.torrent", +        "_itemimage.jpg", +        "_meta.xml", +        "_thumb.png", +        "_files.xml", +    ]: +        if f.name == item_name + suffix or f.name == item_name.lower() + suffix: +            return False +    if f.name.startswith("_"): +        return False +    if item_name.startswith("academictorrents_"): +        for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]: +            if f.name == item_name + suffix: +                return False +    return True + + +def parse_file(f: dict) -> dict: +    """ +    Takes an IA API file and turns it in to a fatcat fileset manifest file +    """ +    assert f.name and f.sha1 and f.md5 +    assert f.name is not None +    mf = { +        "path": f.name, +        "size": int(f.size), +        "sha1": f.sha1, +        "md5": f.md5, +    } +    # TODO: will disable this hard check eventually and replace with: +    # mimetype = FORMAT_TO_MIMETYPE.get(f.format) +    mimetype = FORMAT_TO_MIMETYPE[f.format] +    if mimetype: +        mf["extra"] = dict(mimetype=mimetype) +    return mf + + +def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession): +    print(f"processing item={item_name} release_id={release_id}", file=sys.stderr) +    if release_id.startswith("release_"): +        release_id = release_id[9:] +    assert len(release_id) == 26 +    item = session.get_item(item_name) +    assert item.metadata["mediatype"] not in ["collection", "web"] +    item_files = item.get_files(on_the_fly=False) +    manifest = [parse_file(f) for f in item_files if want_file(f, item_name)] +    fileset = { +        "manifest": manifest, +        "urls": [ +            { +      
          "rel": "archive", +                "url": f"https://archive.org/download/{item_name}/", +            }, +        ], +        "release_ids": [release_id], +        # extra={}, +    } +    print(json.dumps(fileset)) +    return fileset + + +def main(): +    session = internetarchive.get_session() +    if len(sys.argv) == 3: +        item_name = sys.argv[1] +        release_id = sys.argv[2] +        item_to_fileset(item_name, release_id=release_id, session=session) +    else: +        for line in sys.stdin: +            line = line.strip() +            if not line: +                continue +            fields = line.split("\t") +            assert len(fields) == 2 +            item_name = fields[0] +            release_id = fields[1] +            item_to_fileset(item_name, release_id=release_id, session=session) + + +if __name__ == "__main__": +    main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py new file mode 100755 index 0000000..0b60da3 --- /dev/null +++ b/python/scripts/cdx_collection.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +Fetches and merges all CDX files for a collection. + +Calls metadata API to enumerate all items/files, then fetches and concatanates +them all. Requires the 'internetarchive' library. + +Call with a collection name: + +    ./cdx_collection SOME_COLLECTION_NAME +""" + +import os +import shutil +import subprocess +import sys +import tempfile + +import internetarchive as ia +import requests + + +def run(): + +    if len(sys.argv) != 2: +        print("Expected a single argument (collection name)") +        sys.exit(-1) + +    collection = sys.argv[1] + +    # Check collection name is clean +    assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum() + +    tempdir = tempfile.mkdtemp() +    print("Looking up collection: {}".format(collection)) + +    # First fetch list +    item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection))) + +    if len(item_list) == 0: +        print("No items found, bailing") +        sys.exit(-1) + +    print("Found {} potential items".format(len(item_list))) +    status = True +    errors = [] +    for item in item_list: +        item = item["identifier"] +        # TODO: error handling +        try: +            ret = ia.download( +                item, +                files=[item + ".cdx.gz"], +                verbose=True, +                destdir=tempdir, +                no_directory=True, +                retries=1000, +            ) +            status = ret and status +        except requests.exceptions.ReadTimeout as rt: +            print(str(rt), file=sys.stderr) +            errors.append(rt) +            continue + +    if errors: +        print("## Download Errors", file=sys.stderr) +        for e in errors: +            print(e, file=sys.stderr) + +    # Combine files +    print("Merging and re-compressing all CDX files...") +    # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), +    subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True) + +    # Move and cleanup +    shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection)) + +    print("Done!") + + +if __name__ == "__main__": +    run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..e3bf4f0 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -1,12 +1,12 @@  #!/usr/bin/env python3 
-  """  Transform an unpaywall dump (JSON) into ingest requests.  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon @@ -18,38 +18,44 @@ def canon(s):  def transform_cnki(obj):      requests = [] -    assert obj['cnki_id'] - +    assert obj["cnki_id"]      requests = [] -    requests.append({ -        'base_url': canon(obj['info_url']), -        'ingest_type': 'pdf', -        'link_source': 'cnki_covid19', -        'link_source_id': obj['cnki_id'], -        'ingest_request_source': 'scrape-covid19', -    }) -    if 'read_url' in obj: -        requests.append({ -            'base_url': canon(obj['read_url']), -            'ingest_type': 'pdf',  # actually HTML -            'link_source': 'cnki_covid19', -            'link_source_id': obj['cnki_id'], -            'ingest_request_source': 'scrape-covid19', -        }) +    requests.append( +        { +            "base_url": canon(obj["info_url"]), +            "ingest_type": "pdf", +            "link_source": "cnki_covid19", +            "link_source_id": obj["cnki_id"], +            "ingest_request_source": "scrape-covid19", +        } +    ) +    if "read_url" in obj: +        requests.append( +            { +                "base_url": canon(obj["read_url"]), +                "ingest_type": "pdf",  # actually HTML +                "link_source": "cnki_covid19", +                "link_source_id": obj["cnki_id"], +                "ingest_request_source": "scrape-covid19", +            } +        )      return requests +  def transform_wanfang(obj): -    assert obj['wanfang_id'] -    return [{ -        'base_url': canon(obj['url']), -        'ingest_type': 'pdf', -        'link_source': 'wanfang_covid19', -        'link_source_id': obj['wanfang_id'], -        'ingest_request_source': 'scrape-covid19', -    }] +    assert obj["wanfang_id"] +    return [ +        { +            "base_url": canon(obj["url"]), +            "ingest_type": "pdf", +            "link_source": "wanfang_covid19", +            "link_source_id": obj["wanfang_id"], +            "ingest_request_source": "scrape-covid19", +        } +    ]  def run(args): @@ -58,26 +64,27 @@ def run(args):              continue          row = json.loads(l) -        if 'wanfang_id' in row: +        if "wanfang_id" in row:              requests = transform_wanfang(row) or [] -        elif 'cnki_id' in row: +        elif "cnki_id" in row:              requests = transform_cnki(row) or []          else:              continue          for r in requests:              print("{}".format(json.dumps(r, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', -        help="COVID-19 metadata file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..27ccf21 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -19,23 +19,20 @@ Output:  - log to stdout (redirect to file), prefixed by sha1  Requires: -- raven (sentry) +- 
sentry-sdk  - boto3 (AWS S3 client library)  """ -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter  import boto3 -import raven - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +import sentry_sdk  def b32_hex(s): @@ -45,81 +42,80 @@ def b32_hex(s):          s = s[5:]      if len(s) != 32:          return s -    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") -class DeliverDumpGrobidS3(): +class DeliverDumpGrobidS3:      def __init__(self, s3_bucket, **kwargs):          self.rstore = None          self.count = Counter()          self.s3_bucket = s3_bucket -        self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') -        self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') -        self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') -        self.s3 = boto3.resource('s3') +        self.s3_prefix = kwargs.get("s3_prefix", "grobid/") +        self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml") +        self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD") +        self.s3 = boto3.resource("s3")          self.bucket = self.s3.Bucket(self.s3_bucket)      def run(self, dump_file):          sys.stderr.write("Starting...\n")          for line in dump_file: -            line = line.strip().split('\t') +            line = line.strip().split("\t")              if len(line) != 2: -                self.count['skip-line'] += 1 +                self.count["skip-line"] += 1                  continue              sha1_hex, grobid_json = line[0], line[1]              if len(sha1_hex) != 40:                  sha1_hex = b32_hex(sha1_hex)              assert len(sha1_hex) == 40              grobid = json.loads(grobid_json) -            tei_xml = grobid.get('tei_xml') +            tei_xml = grobid.get("tei_xml")              if not tei_xml:                  print("{}\tskip empty".format(sha1_hex)) -                self.count['skip-empty'] += 1 +                self.count["skip-empty"] += 1                  continue -            tei_xml = tei_xml.encode('utf-8') +            tei_xml = tei_xml.encode("utf-8")              # upload to AWS S3              obj = self.bucket.put_object( -                Key="{}{}/{}{}".format( -                    self.s3_prefix, -                    sha1_hex[0:4], -                    sha1_hex, -                    self.s3_suffix), +                Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),                  Body=tei_xml,                  StorageClass=self.s3_storage_class,              )              print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) -            self.count['success-s3'] += 1 +            self.count["success-s3"] += 1          sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--s3-bucket', -                        required=True, -                        type=str, -                        help='AWS S3 bucket to upload into') -    parser.add_argument('--s3-prefix', -                        type=str, -                        default="grobid/", -                        help='key prefix for items created in bucket') -    parser.add_argument('--s3-suffix', -                        type=str, -         
               default=".tei.xml", -                        help='file suffix for created objects') -    parser.add_argument('--s3-storage-class', -                        type=str, -                        default="STANDARD", -                        help='AWS S3 storage class (redundancy) to use') -    parser.add_argument('dump_file', -                        help="TSV/JSON dump file", -                        default=sys.stdin, -                        type=argparse.FileType('r')) +    parser.add_argument( +        "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" +    ) +    parser.add_argument( +        "--s3-prefix", +        type=str, +        default="grobid/", +        help="key prefix for items created in bucket", +    ) +    parser.add_argument( +        "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects" +    ) +    parser.add_argument( +        "--s3-storage-class", +        type=str, +        default="STANDARD", +        help="AWS S3 storage class (redundancy) to use", +    ) +    parser.add_argument( +        "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r") +    )      args = parser.parse_args() +    sentry_sdk.init() +      worker = DeliverDumpGrobidS3(**args.__dict__)      worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__":  # pragma: no cover      main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index 3dcf962..093f32a 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.  # in `wayback` library. Means we can't run pylint.  # pylint: skip-file -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter +from http.client import IncompleteRead -import raven +import sentry_sdk  import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory - -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +from wayback.resourcestore import ResourceStore  class DeliverGwbDisk: -      def __init__(self, disk_dir, **kwargs): -        self.warc_uri_prefix = kwargs.get('warc_uri_prefix') +        self.warc_uri_prefix = kwargs.get("warc_uri_prefix")          self.rstore = None          self.count = Counter()          # /serve/ instead of /download/ doesn't record view count -        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') +        self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")          # gwb library will fall back to reading from /opt/.petabox/webdata.secret -        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.petabox_webdata_secret = kwargs.get( +            "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") +        )          self.disk_dir = disk_dir -        self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') -        self.disk_suffix = kwargs.get('disk_suffix', '.pdf') +        self.disk_prefix = kwargs.get("disk_prefix", "pdf/") +        self.disk_suffix = kwargs.get("disk_suffix", ".pdf")      def fetch_warc_content(self, warc_path, offset, c_size):          warc_uri = self.warc_uri_prefix + warc_path          if not self.rstore: -            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( -                webdata_secret=self.petabox_webdata_secret, -                download_base_url=self.petabox_base_url)) +            self.rstore = ResourceStore( +                loaderfactory=CDXLoaderFactory( +                    webdata_secret=self.petabox_webdata_secret, +                    download_base_url=self.petabox_base_url, +                ) +            )          try:              gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)          except wayback.exception.ResourceUnavailable: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", +            )          except ValueError as ve: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})".format( +                    ve +                ), +            )          except EOFError as eofe: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})".format( +                    eofe +                ), +            )          except TypeError as te: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python 
code)".format( +                    te +                ), +            )          # Note: could consider a generic "except Exception" here, as we get so          # many petabox errors. Do want jobs to fail loud and clear when the          # whole cluster is down though.          if gwb_record.get_status()[0] != 200: -            return None, dict(status="error", +            return None, dict( +                status="error",                  reason="archived HTTP response (WARC) was not 200", -                warc_status=gwb_record.get_status()[0]) +                warc_status=gwb_record.get_status()[0], +            )          try:              raw_content = gwb_record.open_raw_content().read()          except IncompleteRead as ire: -            return None, dict(status="error", -                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +            return None, dict( +                status="error", +                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( +                    ire +                ), +            )          return raw_content, None      def run(self, manifest_file):          sys.stderr.write("Ensuring all 65536 base directories exist...\n")          for i in range(256):              for j in range(256): -                fpath = "{}/{}{:02x}/{:02x}".format( -                        self.disk_dir, -                        self.disk_prefix, -                        i, -                        j) +                fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)                  os.makedirs(fpath, exist_ok=True)          sys.stderr.write("Starting...\n")          for line in manifest_file: -            self.count['total'] += 1 -            line = line.strip().split('\t') +            self.count["total"] += 1 +            line = line.strip().split("\t")              if len(line) != 2: -                self.count['skip-line'] += 1 +                self.count["skip-line"] += 1                  continue              sha1_hex, cdx_json = line[0], line[1]              assert len(sha1_hex) == 40              file_cdx = json.loads(cdx_json)              # If warc is not item/file.(w)arc.gz form, skip it -            if len(file_cdx['warc'].split('/')) != 2: -                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) -                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) -                self.count['skip-warc'] += 1 +            if len(file_cdx["warc"].split("/")) != 2: +                sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"])) +                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) +                self.count["skip-warc"] += 1                  continue              # fetch from GWB/petabox via HTTP range-request -            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            blob, status = self.fetch_warc_content( +                file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] +            )              if blob is None and status: -                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) -                self.count['err-petabox-fetch'] += 1 +                print( +                    "{}\terror petabox\t{}\t{}".format( +                        sha1_hex, file_cdx["warc"], status["reason"] +                    
) +                ) +                self.count["err-petabox-fetch"] += 1                  continue              elif not blob:                  print("{}\tskip-empty-blob".format(sha1_hex)) -                self.count['skip-empty-blob'] += 1 +                self.count["skip-empty-blob"] += 1                  continue              # verify sha1              if sha1_hex != hashlib.sha1(blob).hexdigest(): -                #assert sha1_hex == hashlib.sha1(blob).hexdigest() -                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) +                # assert sha1_hex == hashlib.sha1(blob).hexdigest() +                # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))                  print("{}\terror petabox-hash-mismatch".format(sha1_hex)) -                self.count['err-petabox-hash-mismatch'] += 1 +                self.count["err-petabox-hash-mismatch"] += 1 -            self.count['petabox-ok'] += 1 +            self.count["petabox-ok"] += 1              # save to disk              fpath = "{}/{}{}/{}/{}{}".format( -                    self.disk_dir, -                    self.disk_prefix, -                    sha1_hex[0:2], -                    sha1_hex[2:4], -                    sha1_hex, -                    self.disk_suffix) -            with open(fpath, 'wb') as f: +                self.disk_dir, +                self.disk_prefix, +                sha1_hex[0:2], +                sha1_hex[2:4], +                sha1_hex, +                self.disk_suffix, +            ) +            with open(fpath, "wb") as f:                  f.write(blob)              print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) -            self.count['success-disk'] += 1 +            self.count["success-disk"] += 1          sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--disk-dir', -                        required=True, -                        type=str, -                        help='local base directory to save into') -    parser.add_argument('--disk-prefix', -                        type=str, -                        default="pdf/", -                        help='directory prefix for items created in bucket') -    parser.add_argument('--disk-suffix', -                        type=str, -                        default=".pdf", -                        help='file suffix for created files') -    parser.add_argument('--warc-uri-prefix', -                        type=str, -                        default='https://archive.org/serve/', -                        help='URI where WARCs can be found') -    parser.add_argument('manifest_file', -                        help="TSV/JSON manifest file", -                        default=sys.stdin, -                        type=argparse.FileType('r')) +    parser.add_argument( +        "--disk-dir", required=True, type=str, help="local base directory to save into" +    ) +    parser.add_argument( +        "--disk-prefix", +        type=str, +        default="pdf/", +        help="directory prefix for items created in bucket", +    ) +    parser.add_argument( +        "--disk-suffix", type=str, default=".pdf", help="file suffix for created files" +    ) +    parser.add_argument( +        "--warc-uri-prefix", +        type=str, +        default="https://archive.org/serve/", +        help="URI where WARCs can be found", +    ) +    parser.add_argument( +        "manifest_file", +        help="TSV/JSON manifest 
file", +        default=sys.stdin, +        type=argparse.FileType("r"), +    )      args = parser.parse_args() +    sentry_sdk.init() +      worker = DeliverGwbDisk(**args.__dict__)      worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__":  # pragma: no cover      main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 39ac000..6f37ede 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -24,7 +24,7 @@ Output:  - log to stdout (redirect to file), prefixed by sha1  Requires: -- raven (sentry) +- sentry-sdk  - boto3 (AWS S3 client library)  - wayback/GWB libraries  """ @@ -33,152 +33,180 @@ Requires:  # in `wayback` library. Means we can't run pylint.  # pylint: skip-file -import os -import sys -import json +import argparse  import base64  import hashlib -import argparse +import json +import os +import sys  from collections import Counter +from http.client import IncompleteRead  import boto3 -import raven +import sentry_sdk  import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +from wayback.resourcestore import ResourceStore  class DeliverGwbS3: -      def __init__(self, s3_bucket, **kwargs): -        self.warc_uri_prefix = kwargs.get('warc_uri_prefix') +        self.warc_uri_prefix = kwargs.get("warc_uri_prefix")          self.rstore = None          self.count = Counter()          # /serve/ instead of /download/ doesn't record view count -        self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') +        self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")          # gwb library will fall back to reading from /opt/.petabox/webdata.secret -        self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) +        self.petabox_webdata_secret = kwargs.get( +            "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") +        )          self.s3_bucket = s3_bucket -        self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') -        self.s3_suffix = kwargs.get('s3_suffix', '.pdf') -        self.s3 = boto3.resource('s3') +        self.s3_prefix = kwargs.get("s3_prefix", "pdf/") +        self.s3_suffix = kwargs.get("s3_suffix", ".pdf") +        self.s3 = boto3.resource("s3")          self.bucket = self.s3.Bucket(self.s3_bucket)      def fetch_warc_content(self, warc_path, offset, c_size):          warc_uri = self.warc_uri_prefix + warc_path          if not self.rstore: -            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( -                webdata_secret=self.petabox_webdata_secret, -                download_base_url=self.petabox_base_url)) +            self.rstore = ResourceStore( +                loaderfactory=CDXLoaderFactory( +                    webdata_secret=self.petabox_webdata_secret, +                    download_base_url=self.petabox_base_url, +                ) +            )          try:              gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)          except wayback.exception.ResourceUnavailable: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") +            return None, dict( +       
         status="error", +                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", +            )          except ValueError as ve: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (ValueError: {})".format( +                    ve +                ), +            )          except EOFError as eofe: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (EOFError: {})".format( +                    eofe +                ), +            )          except TypeError as te: -            return None, dict(status="error", -                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) +            return None, dict( +                status="error", +                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( +                    te +                ), +            )          # Note: could consider a generic "except Exception" here, as we get so          # many petabox errors. Do want jobs to fail loud and clear when the          # whole cluster is down though.          if gwb_record.get_status()[0] != 200: -            return None, dict(status="error", +            return None, dict( +                status="error",                  reason="archived HTTP response (WARC) was not 200", -                warc_status=gwb_record.get_status()[0]) +                warc_status=gwb_record.get_status()[0], +            )          try:              raw_content = gwb_record.open_raw_content().read()          except IncompleteRead as ire: -            return None, dict(status="error", -                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) +            return None, dict( +                status="error", +                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( +                    ire +                ), +            )          return raw_content, None      def run(self, manifest_file):          sys.stderr.write("Starting...\n")          for line in manifest_file: -            self.count['total'] += 1 -            line = line.strip().split('\t') +            self.count["total"] += 1 +            line = line.strip().split("\t")              if len(line) != 2: -                self.count['skip-line'] += 1 +                self.count["skip-line"] += 1                  continue              sha1_hex, cdx_json = line[0], line[1]              assert len(sha1_hex) == 40              file_cdx = json.loads(cdx_json)              # If warc is not item/file.(w)arc.gz form, skip it -            if len(file_cdx['warc'].split('/')) != 2: -                sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) -                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) -                self.count['skip-warc'] += 1 +            if len(file_cdx["warc"].split("/")) != 2: +                sys.stderr.write("WARC 
path not petabox item/file: {}".format(file_cdx["warc"])) +                print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) +                self.count["skip-warc"] += 1                  continue              # fetch from GWB/petabox via HTTP range-request -            blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) +            blob, status = self.fetch_warc_content( +                file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] +            )              if blob is None and status: -                print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) -                self.count['err-petabox-fetch'] += 1 +                print( +                    "{}\terror petabox\t{}\t{}".format( +                        sha1_hex, file_cdx["warc"], status["reason"] +                    ) +                ) +                self.count["err-petabox-fetch"] += 1                  continue              elif not blob:                  print("{}\tskip-empty-blob".format(sha1_hex)) -                self.count['skip-empty-blob'] += 1 +                self.count["skip-empty-blob"] += 1                  continue              # verify sha1              if sha1_hex != hashlib.sha1(blob).hexdigest(): -                #assert sha1_hex == hashlib.sha1(blob).hexdigest() -                #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) +                # assert sha1_hex == hashlib.sha1(blob).hexdigest() +                # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))                  print("{}\terror petabox-hash-mismatch".format(sha1_hex)) -                self.count['err-petabox-hash-mismatch'] += 1 +                self.count["err-petabox-hash-mismatch"] += 1 -            self.count['petabox-ok'] += 1 +            self.count["petabox-ok"] += 1              # upload to AWS S3              obj = self.bucket.put_object( -                Key="{}{}/{}{}".format( -                    self.s3_prefix, -                    sha1_hex[0:4], -                    sha1_hex, -                    self.s3_suffix), -                Body=blob) +                Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), +                Body=blob, +            )              print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) -            self.count['success-s3'] += 1 +            self.count["success-s3"] += 1          sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions +  def main():      parser = argparse.ArgumentParser() -    parser.add_argument('--s3-bucket', -                        required=True, -                        type=str, -                        help='AWS S3 bucket to upload into') -    parser.add_argument('--s3-prefix', -                        type=str, -                        default="pdf/", -                        help='key prefix for items created in bucket') -    parser.add_argument('--s3-suffix', -                        type=str, -                        default=".pdf", -                        help='file suffix for created objects') -    parser.add_argument('--warc-uri-prefix', -                        type=str, -                        default='https://archive.org/serve/', -                        help='URI where WARCs can be found') -    parser.add_argument('manifest_file', -                        help="TSV/JSON manifest file", -                        default=sys.stdin, -                        
type=argparse.FileType('r')) +    parser.add_argument( +        "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" +    ) +    parser.add_argument( +        "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket" +    ) +    parser.add_argument( +        "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects" +    ) +    parser.add_argument( +        "--warc-uri-prefix", +        type=str, +        default="https://archive.org/serve/", +        help="URI where WARCs can be found", +    ) +    parser.add_argument( +        "manifest_file", +        help="TSV/JSON manifest file", +        default=sys.stdin, +        type=argparse.FileType("r"), +    )      args = parser.parse_args() +    sentry_sdk.init() +      worker = DeliverGwbS3(**args.__dict__)      worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__":  # pragma: no cover      main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py new file mode 100755 index 0000000..aef5c12 --- /dev/null +++ b/python/scripts/doaj2ingestrequest.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Transform an DOAJ article dump (JSON) into ingest requests. + +TODO: should we also attempt PDF ingest for HTML links? They seem to often be +landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url` +in the HTML headers and adds an ingest request on that basis. Or even just run +the re-ingest in-process and publish a second result. +""" + +import argparse +import json +import sys +from typing import List, Optional + +import urlcanon + +DOMAIN_BLOCKLIST = [ +    # large OA publishers (we get via DOI) +    # large repos and aggregators (we crawl directly) +    "://arxiv.org/", +    "://europepmc.org/", +    "ncbi.nlm.nih.gov/", +    # "semanticscholar.org/", +    "://doi.org/", +    "://dx.doi.org/", +    "zenodo.org/", +    "figshare.com/", +    "://archive.org/", +    ".archive.org/", +    # large publishers/platforms; may remove in the future +    # "://link.springer.com/", +    # "://dergipark.gov.tr/", +    # "frontiersin.org/", +    # "scielo", +] + +# these default to PDF; note that we also do pdf ingests for HTML pages +CONTENT_TYPE_MAP = { +    "abstract": [], +    "doc": [], +    "": ["pdf"], +    "doi": ["pdf"], +    "url": ["pdf"], +    "fulltext": ["pdf"], +    "anySimpleType": ["pdf"], +    "application/pdf": ["pdf"], +    "html": ["html", "pdf"], +    "text/html": ["html", "pdf"], +    "xml": ["xml"], +} + + +def canon(s: str) -> str: +    parsed = urlcanon.parse_url(s) +    return str(urlcanon.whatwg(parsed)) + + +def transform(obj: dict) -> List[dict]: +    """ +    Transforms from a single DOAJ object to zero or more ingest requests. +    Returns a list of dicts. 
+    """ + +    doaj_id = obj["id"].lower() +    assert doaj_id + +    bibjson = obj["bibjson"] +    if not bibjson["link"]: +        return [] + +    requests = [] + +    doi: Optional[str] = None +    for ident in bibjson["identifier"] or []: +        if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."): +            doi = ident["id"].lower() + +    for link in bibjson["link"] or []: +        if link.get("type") != "fulltext" or not link.get("url"): +            continue +        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower()) +        if not ingest_types: +            continue + +        skip = False +        for domain in DOMAIN_BLOCKLIST: +            if domain in link["url"].lower(): +                skip = True +        if skip: +            continue +        try: +            base_url = canon(link["url"].strip()) +        except UnicodeEncodeError: +            continue + +        if not base_url or len(base_url) > 1000: +            continue + +        for ingest_type in ingest_types: +            request = { +                "base_url": base_url, +                "ingest_type": ingest_type, +                "link_source": "doaj", +                "link_source_id": doaj_id, +                "ingest_request_source": "doaj", +                "release_stage": "published", +                "rel": "publisher", +                "ext_ids": { +                    "doi": doi, +                    "doaj": doaj_id, +                }, +                "edit_extra": {}, +            } +            requests.append(request) + +    return requests + + +def run(args) -> None: +    for l in args.json_file: +        if not l.strip(): +            continue +        row = json.loads(l) + +        requests = transform(row) or [] +        for r in requests: +            print("{}".format(json.dumps(r, sort_keys=True))) + + +def main() -> None: +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r") +    ) +    subparsers = parser.add_subparsers() + +    args = parser.parse_args() + +    run(args) + + +if __name__ == "__main__": +    main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 9fe1499..44c091c 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -17,29 +17,32 @@ And outputs JSON objects that are can be imported into fatcat with the  No dependencies (only python3 stdlib)  """ -import sys -import json  import base64 +import json +import sys +  def run():      for line in sys.stdin: -        line = line.split('\t') +        line = line.split("\t")          assert len(line) == 5 -        raw_sha1 = line[0].replace('sha1:', '') +        raw_sha1 = line[0].replace("sha1:", "")          dois = json.loads(line[1])          cdx = json.loads(line[2])          mimetype = line[3]          size = int(line[4]) -        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() +        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()          obj = dict(              sha1=sha1,              dois=dois, -            cdx=[dict(url=cdx['url'], dt=cdx['dt'])], +            cdx=[dict(url=cdx["url"], dt=cdx["dt"])],              size=size, -            mimetype=mimetype) +            mimetype=mimetype, +        )          print(json.dumps(obj)) -if 
__name__=='__main__': + +if __name__ == "__main__":      run() diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py new file mode 100755 index 0000000..2eb56cb --- /dev/null +++ b/python/scripts/fetch_cdx_sha1hex.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +""" +This is a helper script to take fatcat file entities with partial metadata (eg, +missing SHA256) and try to find one or more CDX record where the file may be +found in wayback. + +This script uses the sandcrawler library and should be run like: + +    head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json +""" + +import base64 +import json +import sys +from typing import List, Optional + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error + +from sandcrawler.ia import CdxApiClient, cdx_to_dict + + +def requests_retry_session( +    retries: int = 10, +    backoff_factor: int = 3, +    status_forcelist: List[int] = [500, 502, 504], +    session: requests.Session = None, +) -> requests.Session: +    """ +    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests +    """ +    session = session or requests.Session() +    retry = Retry( +        total=retries, +        read=retries, +        connect=retries, +        backoff_factor=backoff_factor, +        status_forcelist=status_forcelist, +    ) +    adapter = HTTPAdapter(max_retries=retry) +    session.mount("http://", adapter) +    session.mount("https://", adapter) +    return session + + +def b32_hex(s: str) -> str: +    """ +    Converts a base32-encoded SHA-1 checksum into hex-encoded + +    base32 checksums are used by, eg, heritrix and in wayback CDX files +    """ +    s = s.strip().split()[0].lower() +    if s.startswith("sha1:"): +        s = s[5:] +    if len(s) != 32: +        if len(s) == 40: +            return s +        raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) +    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + + +def get_db_cdx(sha1hex: str, http_session) -> List[dict]: +    resp = http_session.get( +        SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex) +    ) +    resp.raise_for_status() +    rows = resp.json() +    return rows or [] + + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + + +def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]: + +    params = { +        "url": url, +        "output": "json", +        "matchType": "exact", +        "limit": 20, +        # TODO: group-by digest/checksum? 
+        # can't filter status because might be warc/revisit +        # "filter": "statuscode:200", +    } +    rows = cdx_api._query_api(params) +    if not rows: +        return None +    for row in rows: +        if row.sha1hex == sha1hex: +            return row +    return None + + +def process_file(fe, session, cdx_api) -> dict: +    status = "unknown" + +    # simple CDX db lookup first +    cdx_row_list = get_db_cdx(fe["sha1"], http_session=session) +    if cdx_row_list: +        return dict( +            file_entity=fe, +            cdx_rows=cdx_row_list, +            status="success-db", +        ) + +    original_urls = [] +    for pair in fe["urls"]: +        u = pair["url"] +        if not "://web.archive.org/web/" in u: +            continue +        seg = u.split("/") +        assert seg[2] == "web.archive.org" +        assert seg[3] == "web" +        if not seg[4].isdigit(): +            continue +        original_url = "/".join(seg[5:]) +        original_urls.append(original_url) + +    if len(original_urls) == 0: +        return dict(file_entity=fe, status="skip-no-urls") + +    found_cdx_rows = [] +    for url in list(set(original_urls)): + +        cdx_record = None +        try: +            cdx_record = get_api_cdx(original_url, sha1hex=fe["sha1"], cdx_api=cdx_api) +        except requests.exceptions.HTTPError as e: +            if e.response.status_code == 403: +                return dict(file_entity=fe, status="fail-cdx-403") +            else: +                raise +        if cdx_record and cdx_record.sha1hex == fe["sha1"]: +            found_cdx_rows.append(cdx_to_dict(cdx_record)) + +    if found_cdx_rows: +        return dict( +            file_entity=fe, +            cdx_rows=found_cdx_rows, +            status="success-api", +        ) + +    return dict( +        file_entity=fe, +        status="fail-not-found", +    ) + + +def main(): +    session = requests_retry_session() +    session.headers.update( +        { +            "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", +        } +    ) +    cdx_api = CdxApiClient() +    for line in sys.stdin: +        if not line.strip(): +            continue +        fe = json.loads(line) +        print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api))) + + +if __name__ == "__main__": +    main() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index c33ab86..8fce0d9 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -1,43 +1,48 @@  #!/usr/bin/env python3 -import sys  import json +import sys -with open('title_slug_blacklist.txt', 'r') as f: -    TITLE_BLACKLIST = [l.strip() for l in f] - -TITLE_BLACKLIST.extend(( -    'editorial', -    'advertisement', -    'bookreviews', -    'reviews', -    'nr', -    'abstractoriginalarticle', -    'originalarticle', -    'impactfactor', -    'articlenumber', -)) +with open("title_slug_denylist.txt", "r") as f: +    TITLE_DENYLIST = [l.strip() for l in f] + +TITLE_DENYLIST.extend( +    ( +        "editorial", +        "advertisement", +        "bookreviews", +        "reviews", +        "nr", +        "abstractoriginalarticle", +        "originalarticle", +        "impactfactor", +        "articlenumber", +    ) +)  # The full name can't *entirely* be one of these -NAME_BLACKLIST = ( -    'phd', -    'phdstudent', +NAME_DENYLIST = ( +    "phd", +    "phdstudent",  ) +  def tokenize(s, remove_whitespace=True): -    s.replace(''', "'") +    s.replace("'", "'")      # 
Remove non-alphanumeric characters -    s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) +    s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])      if remove_whitespace: -        s = ''.join(s.split()) +        s = "".join(s.split())      # Encode as dumb ASCII (TODO: this is horrible) -    return s.encode('ascii', 'replace').decode('utf8').replace('?', '') +    return s.encode("ascii", "replace").decode("utf8").replace("?", "") +  assert tokenize("Impact Factor: 2.114") == "impactfactor" -assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST +assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST +  def filter_title(title): @@ -45,16 +50,16 @@ def filter_title(title):      if len(title) > 500:          return None      title_slug = tokenize(title, remove_whitespace=True) -    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST: +    if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:          return None -    if title_slug.startswith('nr'): +    if title_slug.startswith("nr"):          return None -    if title.lower().replace('.', '').startswith('int j '): +    if title.lower().replace(".", "").startswith("int j "):          return None      for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):          if title.startswith(prefix): -            title.replace(prefix, '') +            title.replace(prefix, "")      if title.startswith("The Journal of "):          return None @@ -78,63 +83,84 @@ def filter_title(title):          return None      # too deep subtitling/splitting -    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: +    if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:          return None      return title +  def filter_author_name(name): -    name = name['name'] -    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST: +    name = name["name"] +    if name.strip().lower().replace(" ", "") in NAME_DENYLIST:          return None -    return ' '.join([t for t in name.split() if tokenize(t)]) +    return " ".join([t for t in name.split() if tokenize(t)]) +  def filter_authors(l):      return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] +  def filter_refs(l):      # TODO:      return l +  def filter_journal_name(name): -    # same blacklist, for now +    # same denylist, for now      if not name:          return None -    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') +    name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")      slug_name = tokenize(name) -    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º": -        return None -    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): +    if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": +        return None +    for prefix in ( +        "/ ", +        "~ ", +        "& ", +        "© ", +        "Original Research Article ", +        "Original Article ", +        "Research Article ", +        "Available online www.jocpr.com ", +    ):          if name.startswith(prefix): -            name = name.replace(prefix, '') -    for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): +            name = name.replace(prefix, "") +    for suffix in ( +        " Available online at www.sciarena.com", +        " Original Article", +        " Available online at", + 
       " ISSN", +        " ISSUE", +    ):          if name.endswith(suffix): -            name = name.replace(suffix, '') +            name = name.replace(suffix, "")      if "====================" in name:          return None      if len(name) > 150:          return None -    return ' '.join(name.split()) +    return " ".join(name.split()) +  def filter_metadata(obj): -    if not (obj.get('title') and obj.get('authors')): +    if not (obj.get("title") and obj.get("authors")):          return None -    title = filter_title(obj['title']) +    title = filter_title(obj["title"])      if not title: -        #sys.stderr.write("bad title\n") +        # sys.stderr.write("bad title\n")          return None      else: -        obj['title'] = title -    obj['authors'] = filter_authors(obj['authors']) -    obj['citations'] = filter_refs(obj['citations']) -    obj['journal']['name'] = filter_journal_name(obj['journal']['name']) +        obj["title"] = title +    obj["authors"] = filter_authors(obj["authors"]) +    obj["citations"] = filter_refs(obj["citations"]) +    obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])      return obj +  def run(invert=False):      for line in sys.stdin: -        fields = line.split('\t') +        fields = line.split("\t")          if len(fields) == 5:              raw = fields[4]          elif len(fields) == 1: @@ -151,9 +177,10 @@ def run(invert=False):                      fields[4] = processed                  else:                      fields[0] = processed -                print('\t'.join(fields)) +                print("\t".join(fields))          elif invert:              print(raw.strip()) -if __name__=="__main__": + +if __name__ == "__main__":      run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index bbba770..87dae16 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:  - dates differ (not just year)  """ -import sys  import json +import sys  # out of 1000  SCORE_THRESHOLD = 900 @@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50  REQUIRE_AUTHORS = False +  def tokenize(s, remove_whitespace=False): -    s.replace(''', "'") +    s.replace("'", "'")      # Remove non-alphanumeric characters -    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) +    s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])      if remove_whitespace: -        s = ''.join(s.split()) +        s = "".join(s.split())      # Encode as dumb ASCII (TODO: this is horrible) -    return s.encode('ascii', 'replace').replace(b'?', b'') +    return s.encode("ascii", "replace").replace(b"?", b"") +  def check_authors(left, right):      """ @@ -51,7 +53,7 @@ def check_authors(left, right):          return False      right_all = tokenize(" ".join(right))      for i in range(len(left)): -        l = left[i].lower().replace('jr.', '').split() +        l = left[i].lower().replace("jr.", "").split()          if not l:              return False          l = tokenize(l[-1]) @@ -59,20 +61,22 @@ def check_authors(left, right):              # weird author name (single char)              return False          if l not in right_all: -            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) +            # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))              return False      return True +  def 
test_check_authors():      assert check_authors([], []) == bool(not REQUIRE_AUTHORS) -    assert not check_authors([], ['one']) -    assert check_authors(['one'], ['one']) -    assert check_authors(['one two'], ['One Two']) -    assert check_authors(['two'], ['One Two']) -    assert check_authors(['two'], ['two, one']) -    assert check_authors(['mago'], ['Mr. Magoo']) -    assert check_authors(['Mr. Magoo'], ['Mr Magoo']) -    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) +    assert not check_authors([], ["one"]) +    assert check_authors(["one"], ["one"]) +    assert check_authors(["one two"], ["One Two"]) +    assert check_authors(["two"], ["One Two"]) +    assert check_authors(["two"], ["two, one"]) +    assert check_authors(["mago"], ["Mr. Magoo"]) +    assert check_authors(["Mr. Magoo"], ["Mr Magoo"]) +    assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) +  # Rows are (score, left, right)  def process_group(rows): @@ -86,10 +90,10 @@ def process_group(rows):          left = json.loads(row[1])          right = json.loads(row[2])          # authors must roughly match -        if not check_authors(left['authors'], right['authors']): +        if not check_authors(left["authors"], right["authors"]):              continue          # years must match (if defined) -        if left['year'] and right['year'] and left['year'] != right['year']: +        if left["year"] and right["year"] and left["year"] != right["year"]:              continue          filtered.append((left, right)) @@ -101,8 +105,8 @@ def process_group(rows):      group_ids = set()      for row in filtered[1:]:          (left, right) = row -        l_id = left['fatcat_release'] -        r_id = right['fatcat_release'] +        l_id = left["fatcat_release"] +        r_id = right["fatcat_release"]          releases[l_id] = left          releases[r_id] = right          if not group_ids: @@ -119,6 +123,7 @@ def process_group(rows):      print(json.dumps([releases[ident] for ident in group_ids])) +  def run():      last_slug = None @@ -126,7 +131,7 @@ def run():      # group lines by slug, and process in batches      for line in sys.stdin: -        line = line.strip().split('\t') +        line = line.strip().split("\t")          assert len(line) == 4          slug = line[0]          if last_slug and slug != last_slug and lines: @@ -140,5 +145,6 @@ def run():      if lines:          process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__":      run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3654b87..c5b7eef 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).  
No dependencies (only python3 stdlib)  """ -import sys  import json +import sys  # out of 1000  score_threshold = 900 @@ -23,15 +23,16 @@ require_authors = 1  def tokenize(s, remove_whitespace=False): -    s.replace(''', "'") +    s.replace("'", "'")      # Remove non-alphanumeric characters -    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) +    s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])      if remove_whitespace: -        s = ''.join(s.split()) +        s = "".join(s.split())      # Encode as dumb ASCII (TODO: this is horrible) -    return s.encode('ascii', 'replace').replace(b'?', b'') +    return s.encode("ascii", "replace").replace(b"?", b"") +  def check_authors(left, right):      """ @@ -44,7 +45,7 @@ def check_authors(left, right):          return False      right_all = tokenize(" ".join(right))      for i in range(len(left)): -        l = left[i].lower().replace('jr.', '').split() +        l = left[i].lower().replace("jr.", "").split()          if not l:              return False          l = tokenize(l[-1]) @@ -52,20 +53,22 @@ def check_authors(left, right):              # weird author name (single char)              return False          if l not in right_all: -            #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) +            # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))              return False      return True +  def test_check_authors():      assert not check_authors([], []) -    assert not check_authors([], ['one']) -    assert check_authors(['one'], ['one']) -    assert check_authors(['one two'], ['One Two']) -    assert check_authors(['two'], ['One Two']) -    assert check_authors(['two'], ['two, one']) -    assert check_authors(['mago'], ['Mr. Magoo']) -    assert check_authors(['Mr. Magoo'], ['Mr Magoo']) -    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) +    assert not check_authors([], ["one"]) +    assert check_authors(["one"], ["one"]) +    assert check_authors(["one two"], ["One Two"]) +    assert check_authors(["two"], ["One Two"]) +    assert check_authors(["two"], ["two, one"]) +    assert check_authors(["mago"], ["Mr. Magoo"]) +    assert check_authors(["Mr. 
Magoo"], ["Mr Magoo"]) +    assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) +  # Rows are (score, grobid, crossref)  def process_group(rows): @@ -78,20 +81,21 @@ def process_group(rows):              continue          grobid = json.loads(row[1])          crossref = json.loads(row[2]) -        if not check_authors(crossref['authors'], grobid['authors']): -            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) +        if not check_authors(crossref["authors"], grobid["authors"]): +            # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))              continue          else: -            #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) +            # print("YES: {} {}".format(crossref['authors'], grobid['authors']))              pass -        sha1 = grobid['sha1'] -        doi = crossref['doi'].lower() +        sha1 = grobid["sha1"] +        doi = crossref["doi"].lower()          l = keepers.get(sha1, list())          l.append(doi)          keepers[sha1] = l      for sha1, doi_list in keepers.items():          print("{}\t{}".format(sha1, json.dumps(doi_list))) +  def run():      last_slug = None @@ -99,7 +103,7 @@ def run():      # group lines by slug, and process in batches      for line in sys.stdin: -        line = line.strip().split('\t') +        line = line.strip().split("\t")          assert len(line) == 4          slug = line[0]          if last_slug and slug != last_slug and lines: @@ -112,5 +116,6 @@ def run():      if lines:          process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__":      run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 79feac1..90a0f77 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction  output, converts the XML to JSON, filters out raw affiliation strings, and @@ -10,43 +9,49 @@ Run in bulk like:      ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'  """ -import sys  import json +import sys + +from grobid_tei_xml import parse_document_xml -from grobid2json import teixml2json  def parse_hbase(line): -    line = line.split('\t') +    line = line.split("\t")      assert len(line) == 2      sha1hex = line[0]      obj = json.loads(line[1]) -    tei_xml = obj['tei_xml'] +    tei_xml = obj["tei_xml"]      return sha1hex, tei_xml +  def parse_pg(line):      obj = json.loads(line) -    return obj['sha1hex'], obj['tei_xml'] +    return obj["sha1hex"], obj["tei_xml"] + -def run(mode='hbase'): +def run(mode="hbase"):      for line in sys.stdin: -        if mode == 'hbase': +        if mode == "hbase":              sha1hex, tei_xml = parse_hbase(line) -        elif mode == 'pg': +        elif mode == "pg":              sha1hex, tei_xml = parse_pg(line)          else: -            raise NotImplementedError('parse mode: {}'.format(mode)) +            raise NotImplementedError("parse mode: {}".format(mode)) -        obj = teixml2json(tei_xml, encumbered=False) +        tei_doc = parse_document_xml(tei_xml) +        tei_doc.remove_encumbered() +        obj = tei_doc.to_legacy_dict()          affiliations = [] -        for author in obj['authors']: -            if author.get('affiliation'): -                
affiliations.append(author['affiliation']) +        for author in obj["authors"]: +            if author.get("affiliation"): +                affiliations.append(author["affiliation"])          if affiliations:              # don't duplicate affiliations; only the unique ones              affiliations = list(set([json.dumps(a) for a in affiliations]))              affiliations = [json.loads(a) for a in affiliations] -            print('\t'.join([sha1hex, json.dumps(affiliations)])) +            print("\t".join([sha1hex, json.dumps(affiliations)])) + -if __name__=='__main__': +if __name__ == "__main__":      run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index 3d2e14c..f941881 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -1,69 +1,67 @@  #!/usr/bin/env python3 -import sys -import json  import datetime +import json +import sys + +MAX_ABSTRACT_BYTES = 4096 -MAX_ABSTRACT_BYTES=4096  def parse_grobid_json(obj): -    if not obj.get('title'): +    if not obj.get("title"):          return None      extra = dict() -    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: -        abobj = dict( -            mimetype="text/plain", -            language=None, -            content=obj.get('abstract').strip()) +    if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES: +        abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())          abstracts = [abobj]      else:          abstracts = None      contribs = [] -    for a in obj.get('authors', []): +    for a in obj.get("authors", []):          c = dict(raw_name=a, role="author")          contribs.append(c)      refs = [] -    for raw in obj.get('citations', []): +    for raw in obj.get("citations", []):          extra = dict()          ref = dict() -        ref['key'] = raw.get('id') -        if raw.get('title'): -            ref['title'] = raw['title'].strip() -        if raw.get('date'): +        ref["key"] = raw.get("id") +        if raw.get("title"): +            ref["title"] = raw["title"].strip() +        if raw.get("date"):              try: -                year = int(raw['date'].strip()[:4]) -                ref['year'] = year +                year = int(raw["date"].strip()[:4]) +                ref["year"] = year              except:                  pass -        for key in ('volume', 'url', 'issue', 'publisher'): +        for key in ("volume", "url", "issue", "publisher"):              if raw.get(key):                  extra[key] = raw[key].strip() -        if raw.get('authors'): -            extra['authors'] = [a['name'] for a in raw['authors']] +        if raw.get("authors"): +            extra["authors"] = [a["name"] for a in raw["authors"]]          if extra:              extra = dict(grobid=extra)          else:              extra = None -        ref['extra'] = extra +        ref["extra"] = extra          refs.append(ref)      release_type = "journal-article"      release_date = None -    if obj.get('date'): +    if obj.get("date"):          # TODO: only returns year, ever? how to handle? 
-        release_date = datetime.datetime(year=obj['date'], month=1, day=1) +        release_date = datetime.datetime(year=obj["date"], month=1, day=1) -    if obj.get('doi'): -        extra['doi'] = obj['doi'] -    if obj['journal'].get('name'): -        extra['container_name'] = obj['journal']['name'] +    if obj.get("doi"): +        extra["doi"] = obj["doi"].lower() +    if obj["journal"].get("name"): +        extra["container_name"] = obj["journal"]["name"] -    extra['is_longtail_oa'] = True +    extra["is_longtail_oa"] = True      # TODO: ISSN/eISSN handling? or just journal name lookup? @@ -73,15 +71,17 @@ def parse_grobid_json(obj):          extra = None      return dict( -        title=obj['title'].strip(), +        title=obj["title"].strip(),          contribs=contribs, -        publisher=obj['journal'].get('publisher'), -        volume=obj['journal'].get('volume'), -        issue=obj['journal'].get('issue'), +        publisher=obj["journal"].get("publisher"), +        volume=obj["journal"].get("volume"), +        issue=obj["journal"].get("issue"),          abstracts=abstracts,          release_type=release_type,          release_date=release_date, -        extra=extra) +        extra=extra, +    ) +  def run():      for line in sys.stdin: @@ -90,5 +90,6 @@ def run():          if out:              print(out) -if __name__=="__main__": + +if __name__ == "__main__":      run() diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 494ec7a..8a353ca 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  This script is used to turn ingest request postgres rows (in JSON export  format) back in to regular ingest request JSON. @@ -7,24 +6,25 @@ format) back in to regular ingest request JSON.  The only difference is the name and location of some optional keys.  
""" -import sys -import json  import argparse +import json +import sys  def transform(row):      """      dict-to-dict      """ -    row.pop('created', None) -    extra = row.pop('request', None) or {} -    for k in ('ext_ids', 'edit_extra'): +    row.pop("created", None) +    extra = row.pop("request", None) or {} +    for k in ("ext_ids", "edit_extra"):          if k in extra:              row[k] = extra[k] -    if 'release_ident' in extra: -        row['fatcat'] = dict(release_ident=extra['release_ident']) +    if "release_ident" in extra: +        row["fatcat"] = dict(release_ident=extra["release_ident"])      return row +  def run(args):      for l in args.json_file:          if not l.strip(): @@ -33,19 +33,27 @@ def run(args):              req = transform(json.loads(l))          except:              print(l, file=sys.stderr) +        if args.force_recrawl: +            req["force_recrawl"] = True          print(json.dumps(req, sort_keys=True)) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', -        help="arabesque output file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="SQL output JSON file to process", type=argparse.FileType("r") +    ) +    parser.add_argument( +        "--force-recrawl", +        action="store_true", +        help="whether to add recrawl (SPNv2) flag to request", +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 35cee5b..24e22fd 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -10,9 +10,9 @@ This was used to convert this manifest:  to JSON format for fast fatcat importing.  """ -import sys  import json  import sqlite3 +import sys  # iterate over rows in files metadata...  # 1. select all identified DOIs @@ -20,6 +20,7 @@ import sqlite3  # 2. select all file metadata  # 3. output object +  def or_none(s):      if s is None:          return None @@ -27,6 +28,7 @@ def or_none(s):          return None      return s +  def process_db(db_path):      db = sqlite3.connect(db_path) @@ -52,5 +54,6 @@ def process_db(db_path):          dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])          print(json.dumps(obj)) -if __name__=="__main__": + +if __name__ == "__main__":      process_db(sys.argv[1]) diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 916f41c..97c38f9 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -1,19 +1,18 @@  #!/usr/bin/env python3 -  """  Transform an OAI-PMH bulk dump (JSON) into ingest requests.  
Eg: https://archive.org/details/oai_harvest_20200215  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon  DOMAIN_BLOCKLIST = [      # large OA publishers (we get via DOI) -      # large repos and aggregators (we crawl directly)      "://arxiv.org/",      "://europepmc.org/", @@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [      "://archive.org/",      ".archive.org/",      "://127.0.0.1/", - +    "://www.kb.dk/", +    "://kb-images.kb.dk/", +    "://mdz-nbn-resolving.de/", +    "://aggr.ukm.um.si/", +    "://edoc.mpg.de/", +    "doaj.org/", +    "orcid.org/", +    "://gateway.isiknowledge.com/",      # OAI specific additions      "://hdl.handle.net/",  ] +# OAI identifier prefixes for repositories that we want to skip (for various reasons) +OAI_BLOCKLIST = [ +    "oai:kb.dk:", +    "oai:bdr.oai.bsb-muenchen.de:", +    "oai:hispana.mcu.es:", +    "oai:bnf.fr:", +    "oai:ukm.si:", +    "oai:biodiversitylibrary.org:", +    "oai:hsp.org:", +    "oai:repec:", +    "oai:n/a:", +    "oai:quod.lib.umich.edu:", +    "oai:americanae.aecid.es:", +    "oai:www.irgrid.ac.cn:", +    "oai:espace.library.uq.edu:", +    "oai:edoc.mpg.de:", +    "oai:bibliotecadigital.jcyl.es:", +    "oai:repository.erciyes.edu.tr:", +    "oai:krm.or.kr:", +    "oai:hypotheses.org:%", +] +  RELEASE_STAGE_MAP = { -    'info:eu-repo/semantics/draftVersion':     'draft', -    'info:eu-repo/semantics/submittedVersion': 'submitted', -    'info:eu-repo/semantics/acceptedVersion':  'accepted', -    'info:eu-repo/semantics/publishedVersion': 'published', -    'info:eu-repo/semantics/updatedVersion':   'updated', +    "info:eu-repo/semantics/draftVersion": "draft", +    "info:eu-repo/semantics/submittedVersion": "submitted", +    "info:eu-repo/semantics/acceptedVersion": "accepted", +    "info:eu-repo/semantics/publishedVersion": "published", +    "info:eu-repo/semantics/updatedVersion": "updated",  } +  def canon(s):      parsed = urlcanon.parse_url(s)      return str(urlcanon.whatwg(parsed)) +  def transform(obj):      """      Transforms from a single OAI-PMH object to zero or more ingest requests. @@ -50,38 +80,43 @@ def transform(obj):      """      requests = [] -    if not obj.get('oai') or not obj['oai'].startswith('oai:'): +    if not obj.get("oai") or not obj["oai"].startswith("oai:"):          return [] -    if not obj.get('urls'): +    if not obj.get("urls"):          return [] +    oai_id = obj["oai"].lower() +    for prefix in OAI_BLOCKLIST: +        if oai_id.startswith(prefix): +            return [] +      # look in obj['formats'] for PDF? -    if obj.get('formats'): +    if obj.get("formats"):          # if there is a list of formats, and it does not contain PDF, then          # skip. Note that we will continue if there is no formats list.          
has_pdf = False -        for f in obj['formats']: -            if 'pdf' in f.lower(): +        for f in obj["formats"]: +            if "pdf" in f.lower():                  has_pdf = True          if not has_pdf:              return []      doi = None -    if obj.get('doi'): -        doi = obj['doi'][0].lower().strip() -        if not doi.startswith('10.'): +    if obj.get("doi"): +        doi = obj["doi"][0].lower().strip() +        if not doi.startswith("10."):              doi = None      # infer release stage and/or type from obj['types']      release_stage = None -    for t in obj.get('types', []): +    for t in obj.get("types", []):          if t in RELEASE_STAGE_MAP:              release_stage = RELEASE_STAGE_MAP[t]      # TODO: infer rel somehow? Eg, repository vs. OJS publisher      rel = None -    for url in obj['urls']: +    for url in obj["urls"]:          skip = False          for domain in DOMAIN_BLOCKLIST:              if domain in url: @@ -94,23 +129,25 @@ def transform(obj):              continue          request = { -            'base_url': base_url, -            'ingest_type': 'pdf', -            'link_source': 'oai', -            'link_source_id': obj['oai'].lower(), -            'ingest_request_source': 'metha-bulk', -            'release_stage': release_stage, -            'rel': rel, -            'ext_ids': { -                'doi': doi, -                'oai': obj['oai'].lower(), +            "base_url": base_url, +            "ingest_type": "pdf", +            "link_source": "oai", +            "link_source_id": oai_id, +            "ingest_request_source": "metha-bulk", +            "release_stage": release_stage, +            "rel": rel, +            "ext_ids": { +                "oai": obj["oai"].lower(),              }, -            'edit_extra': {}, +            "edit_extra": {},          } +        if doi: +            request["ext_ids"]["doi"] = doi          requests.append(request)      return requests +  def run(args):      for l in args.json_file:          if not l.strip(): @@ -121,17 +158,20 @@ def run(args):          for r in requests:              print("{}".format(json.dumps(r, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file",          help="OAI-PMH dump file to use (usually stdin)", -        type=argparse.FileType('r')) +        type=argparse.FileType("r"), +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index e093dc3..8b57c5b 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -1,5 +1,4 @@  #!/usr/bin/env python3 -  """  Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). @@ -7,6 +6,7 @@ Originally used to benchmark and compare file size/quality.  
"""  import sys +  import poppler  from PIL import Image @@ -22,13 +22,16 @@ def run(inpath, outpath):      renderer = poppler.PageRenderer()      full_page = renderer.render_page(page) -    img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "RGBA", 0, 1) -    img.thumbnail((180,300), Image.BICUBIC) -    #img.thumbnail((360,600), Image.BICUBIC) +    img = Image.frombuffer( +        "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1 +    ) +    img.thumbnail((180, 300), Image.BICUBIC) +    # img.thumbnail((360,600), Image.BICUBIC)      img.save(outpath) -    #img.save(outpath, quality=95) +    # img.save(outpath, quality=95) + -if __name__ == '__main__': +if __name__ == "__main__":      if len(sys.argv) != 3:          print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)          sys.exit(-1) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 5536e6c..cb64a1a 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,41 +1,39 @@  #!/usr/bin/env python3 -  """  Transform an unpaywall dump (JSON) into ingest requests.  """ -import sys -import json  import argparse +import json +import sys +  import urlcanon  DOMAIN_BLOCKLIST = [      # large OA publishers (we get via DOI) -      # large repos and aggregators (we crawl directly)      "://arxiv.org/",      "://europepmc.org/",      "ncbi.nlm.nih.gov/", -    "semanticscholar.org/",      "://doi.org/",      "zenodo.org/",      "figshare.com/", -    "://archive.org/", -    ".archive.org/",  ]  RELEASE_STAGE_MAP = { -    'draftVersion':     'draft', -    'submittedVersion': 'submitted', -    'acceptedVersion':  'accepted', -    'publishedVersion': 'published', -    'updatedVersion':   'updated', +    "draftVersion": "draft", +    "submittedVersion": "submitted", +    "acceptedVersion": "accepted", +    "publishedVersion": "published", +    "updatedVersion": "updated",  } +  def canon(s):      parsed = urlcanon.parse_url(s)      return str(urlcanon.whatwg(parsed)) +  def transform(obj):      """      Transforms from a single unpaywall object to zero or more ingest requests. 
@@ -43,48 +41,49 @@ def transform(obj):      """      requests = [] -    if not obj['doi'].startswith('10.'): +    if not obj["doi"].startswith("10."):          return requests -    if not obj['oa_locations']: +    if not obj["oa_locations"]:          return requests -    for location in obj['oa_locations']: -        if not location['url_for_pdf']: +    for location in obj["oa_locations"]: +        if not location["url_for_pdf"]:              continue          skip = False          for domain in DOMAIN_BLOCKLIST: -            if domain in location['url_for_pdf']: +            if domain in location["url_for_pdf"]:                  skip = True          if skip:              continue          try: -            base_url = canon(location['url_for_pdf']) +            base_url = canon(location["url_for_pdf"])          except UnicodeEncodeError:              continue          request = { -            'base_url': base_url, -            'ingest_type': 'pdf', -            'link_source': 'unpaywall', -            'link_source_id': obj['doi'].lower(), -            'ingest_request_source': 'unpaywall', -            'release_stage': RELEASE_STAGE_MAP.get(location['version']), -            'rel': location['host_type'], -            'ext_ids': { -                'doi': obj['doi'].lower(), +            "base_url": base_url, +            "ingest_type": "pdf", +            "link_source": "unpaywall", +            "link_source_id": obj["doi"].lower(), +            "ingest_request_source": "unpaywall", +            "release_stage": RELEASE_STAGE_MAP.get(location["version"]), +            "rel": location["host_type"], +            "ext_ids": { +                "doi": obj["doi"].lower(),              }, -            'edit_extra': {}, +            "edit_extra": {},          } -        if obj.get('oa_status'): -            request['edit_extra']['oa_status'] = obj['oa_status'] -        if location.get('evidence'): -            request['edit_extra']['evidence'] = location['evidence'] -        if location['pmh_id']: -            request['ext_ids']['pmh_id'] = location['pmh_id'] +        if obj.get("oa_status"): +            request["edit_extra"]["oa_status"] = obj["oa_status"] +        if location.get("evidence"): +            request["edit_extra"]["evidence"] = location["evidence"] +        if location["pmh_id"]: +            request["ext_ids"]["pmh_id"] = location["pmh_id"]          requests.append(request)      return requests +  def run(args):      for l in args.json_file:          if not l.strip(): @@ -95,17 +94,18 @@ def run(args):          for r in requests:              print("{}".format(json.dumps(r, sort_keys=True))) +  def main(): -    parser = argparse.ArgumentParser( -        formatter_class=argparse.ArgumentDefaultsHelpFormatter) -    parser.add_argument('json_file', -        help="unpaywall dump file to use", -        type=argparse.FileType('r')) +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument( +        "json_file", help="unpaywall dump file to use", type=argparse.FileType("r") +    )      subparsers = parser.add_subparsers()      args = parser.parse_args()      run(args) -if __name__ == '__main__': + +if __name__ == "__main__":      main() | 
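The grobid_affiliations.py hunk above swaps the old grobid2json.teixml2json() helper for the grobid_tei_xml package. A minimal sketch of the new call pattern, using only the names that appear in the diff (parse_document_xml, remove_encumbered, to_legacy_dict); the extract_affiliations wrapper itself is illustrative and not part of the script:

import json

from grobid_tei_xml import parse_document_xml


def extract_affiliations(tei_xml: str) -> list:
    # parse the GROBID TEI-XML, strip potentially copyright-encumbered
    # fields, and convert to the dict shape the old teixml2json returned
    tei_doc = parse_document_xml(tei_xml)
    tei_doc.remove_encumbered()
    legacy = tei_doc.to_legacy_dict()
    affiliations = [
        author["affiliation"]
        for author in legacy.get("authors", [])
        if author.get("affiliation")
    ]
    # de-duplicate affiliation dicts by round-tripping through JSON,
    # the same trick the script uses
    unique = set(json.dumps(a, sort_keys=True) for a in affiliations)
    return [json.loads(a) for a in unique]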
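The oai2ingestrequest.py change above also reworks ext_ids so the DOI is attached only when the OAI record actually carried one. A sketch of the emitted request shape, with hypothetical identifier and URL values; the key names are taken directly from the diff:

# hypothetical values; key names match the request dict built in the diff
request = {
    "base_url": "https://repo.example.org/record/123/paper.pdf",
    "ingest_type": "pdf",
    "link_source": "oai",
    "link_source_id": "oai:repo.example.org:123",
    "ingest_request_source": "metha-bulk",
    "release_stage": None,
    "rel": None,
    "ext_ids": {"oai": "oai:repo.example.org:123"},
    "edit_extra": {},
}

doi = None  # e.g. "10.1234/example" when the record included a valid DOI
if doi:
    request["ext_ids"]["doi"] = doi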

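The pdf_thumbnail.py hunk fixes the raw decoder mode passed to PIL: python-poppler appears to hand back BGRA-ordered pixel data, so the buffer is decoded as "BGRA" while the target image mode stays "RGBA". A self-contained sketch of the corrected rendering path, assuming the python-poppler document/page API (load_from_file, create_page) used earlier in that script; paths and thumbnail size are illustrative:

import sys

import poppler
from PIL import Image


def render_thumbnail(inpath: str, outpath: str) -> None:
    page = poppler.load_from_file(inpath).create_page(0)
    full_page = poppler.PageRenderer().render_page(page)
    # decode the BGRA pixel buffer from poppler into an RGBA PIL image
    img = Image.frombuffer(
        "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
    )
    img.thumbnail((180, 300), Image.BICUBIC)
    img.save(outpath)


if __name__ == "__main__":
    render_thumbnail(sys.argv[1], sys.argv[2])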