Diffstat (limited to 'python/scripts')
20 files changed, 1116 insertions, 559 deletions
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 03a1f29..4561541 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ This script is intended to be used for backfill ingest of old crawls. It can also be used as a fast path for getting freshly crawled content into fatcat if @@ -12,9 +11,9 @@ Run like: Can then run through requests using that tool, or dump into kafka queue. """ -import sys -import json import argparse +import json +import sys def run(args): @@ -22,51 +21,54 @@ def run(args): if not l.strip(): continue row = json.loads(l) - if not row['hit']: + if not row["hit"]: continue request = { - 'base_url': row['final_url'], - 'ingest_type': args.ingest_type, - 'link_source': args.link_source, - 'link_source_id': row['identifier'], - 'ingest_request_source': args.ingest_request_source, - 'ext_ids': { - args.extid_type: row['identifier'], + "base_url": row["final_url"], + "ingest_type": args.ingest_type, + "link_source": args.link_source, + "link_source_id": row["identifier"], + "ingest_request_source": args.ingest_request_source, + "ext_ids": { + args.extid_type: row["identifier"], }, } if args.release_stage: - assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update') - request['release_stage'] = args.release_stage + assert args.release_stage in ( + "published", + "submitted", + "accepted", + "draft", + "update", + ) + request["release_stage"] = args.release_stage print("{}".format(json.dumps(request, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--link-source', - required=True, - help="link_source to include in request") - parser.add_argument('--extid-type', - required=True, - help="extid to encode identifier as") - parser.add_argument('--ingest-type', - default="pdf", - help="ingest type (pdf, html, xml, etc)") - parser.add_argument('--ingest-request-source', - default="arabesque", - help="to include in request") - parser.add_argument('--release-stage', - default=None, - help="to include in request") - parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--link-source", required=True, help="link_source to include in request" + ) + parser.add_argument("--extid-type", required=True, help="extid to encode identifier as") + parser.add_argument( + "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)" + ) + parser.add_argument( + "--ingest-request-source", default="arabesque", help="to include in request" + ) + parser.add_argument("--release-stage", default=None, help="to include in request") + parser.add_argument( + "json_file", help="arabesque output file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py new file mode 100755 index 0000000..6328f52 --- /dev/null +++ b/python/scripts/archiveorg_fileset.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Helper script to + +Takes either two args (release ident and archive.org item), or a stream of +tab-separated such pairs on stdin. 
+ +TODO: +- should this check the item type? +""" + +import json +import sys +from typing import Any + +import internetarchive + +FORMAT_TO_MIMETYPE = { + "BZIP": "application/x-bzip", + "BZIP2": "application/x-bzip2", + "ZIP": "application/zip", + "GZIP": "application/gzip", + "RAR": "application/vnd.rar", + "TAR": "application/x-tar", + "7z": "application/x-7z-compressed", + "HTML": "text/html", + "Text": "text/plain", + "PDF": "application/pdf", + "CSV": "text/csv", + "XML": "application/xml", + "JSON": "application/json", + #'application/msword (.doc)', # .doc + #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx + #'application/vnd.ms-excel', # .xls + #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx + "MP3": "audio/mpeg", # .mp3 + "MP4": "video/mp4", # .mp4 + "MPEG": "video/mpeg", # .mpeg + "JPEG": "image/jpeg", + "GIF": "image/gif", + "PNG": "image/png", + "TIFF": "image/tiff", + "Unknown": None, +} + + +def want_file(f: dict, item_name: str) -> bool: + """ + Filters IA API files + """ + if f.source != "original": + return False + for suffix in [ + "_meta.sqlite", + "_archive.torrent", + "_itemimage.jpg", + "_meta.xml", + "_thumb.png", + "_files.xml", + ]: + if f.name == item_name + suffix or f.name == item_name.lower() + suffix: + return False + if f.name.startswith("_"): + return False + if item_name.startswith("academictorrents_"): + for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]: + if f.name == item_name + suffix: + return False + return True + + +def parse_file(f: dict) -> dict: + """ + Takes an IA API file and turns it in to a fatcat fileset manifest file + """ + assert f.name and f.sha1 and f.md5 + assert f.name is not None + mf = { + "path": f.name, + "size": int(f.size), + "sha1": f.sha1, + "md5": f.md5, + } + # TODO: will disable this hard check eventually and replace with: + # mimetype = FORMAT_TO_MIMETYPE.get(f.format) + mimetype = FORMAT_TO_MIMETYPE[f.format] + if mimetype: + mf["extra"] = dict(mimetype=mimetype) + return mf + + +def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession): + print(f"processing item={item_name} release_id={release_id}", file=sys.stderr) + if release_id.startswith("release_"): + release_id = release_id[9:] + assert len(release_id) == 26 + item = session.get_item(item_name) + assert item.metadata["mediatype"] not in ["collection", "web"] + item_files = item.get_files(on_the_fly=False) + manifest = [parse_file(f) for f in item_files if want_file(f, item_name)] + fileset = { + "manifest": manifest, + "urls": [ + { + "rel": "archive", + "url": f"https://archive.org/download/{item_name}/", + }, + ], + "release_ids": [release_id], + # extra={}, + } + print(json.dumps(fileset)) + return fileset + + +def main(): + session = internetarchive.get_session() + if len(sys.argv) == 3: + item_name = sys.argv[1] + release_id = sys.argv[2] + item_to_fileset(item_name, release_id=release_id, session=session) + else: + for line in sys.stdin: + line = line.strip() + if not line: + continue + fields = line.split("\t") + assert len(fields) == 2 + item_name = fields[0] + release_id = fields[1] + item_to_fileset(item_name, release_id=release_id, session=session) + + +if __name__ == "__main__": + main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py new file mode 100755 index 0000000..0b60da3 --- /dev/null +++ b/python/scripts/cdx_collection.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +Fetches 
and merges all CDX files for a collection. + +Calls metadata API to enumerate all items/files, then fetches and concatanates +them all. Requires the 'internetarchive' library. + +Call with a collection name: + + ./cdx_collection SOME_COLLECTION_NAME +""" + +import os +import shutil +import subprocess +import sys +import tempfile + +import internetarchive as ia +import requests + + +def run(): + + if len(sys.argv) != 2: + print("Expected a single argument (collection name)") + sys.exit(-1) + + collection = sys.argv[1] + + # Check collection name is clean + assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum() + + tempdir = tempfile.mkdtemp() + print("Looking up collection: {}".format(collection)) + + # First fetch list + item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection))) + + if len(item_list) == 0: + print("No items found, bailing") + sys.exit(-1) + + print("Found {} potential items".format(len(item_list))) + status = True + errors = [] + for item in item_list: + item = item["identifier"] + # TODO: error handling + try: + ret = ia.download( + item, + files=[item + ".cdx.gz"], + verbose=True, + destdir=tempdir, + no_directory=True, + retries=1000, + ) + status = ret and status + except requests.exceptions.ReadTimeout as rt: + print(str(rt), file=sys.stderr) + errors.append(rt) + continue + + if errors: + print("## Download Errors", file=sys.stderr) + for e in errors: + print(e, file=sys.stderr) + + # Combine files + print("Merging and re-compressing all CDX files...") + # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), + subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True) + + # Move and cleanup + shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection)) + + print("Done!") + + +if __name__ == "__main__": + run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 33c425d..e3bf4f0 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. 
""" -import sys -import json import argparse +import json +import sys + import urlcanon @@ -18,38 +18,44 @@ def canon(s): def transform_cnki(obj): requests = [] - assert obj['cnki_id'] - + assert obj["cnki_id"] requests = [] - requests.append({ - 'base_url': canon(obj['info_url']), - 'ingest_type': 'pdf', - 'link_source': 'cnki_covid19', - 'link_source_id': obj['cnki_id'], - 'ingest_request_source': 'scrape-covid19', - }) - if 'read_url' in obj: - requests.append({ - 'base_url': canon(obj['read_url']), - 'ingest_type': 'pdf', # actually HTML - 'link_source': 'cnki_covid19', - 'link_source_id': obj['cnki_id'], - 'ingest_request_source': 'scrape-covid19', - }) + requests.append( + { + "base_url": canon(obj["info_url"]), + "ingest_type": "pdf", + "link_source": "cnki_covid19", + "link_source_id": obj["cnki_id"], + "ingest_request_source": "scrape-covid19", + } + ) + if "read_url" in obj: + requests.append( + { + "base_url": canon(obj["read_url"]), + "ingest_type": "pdf", # actually HTML + "link_source": "cnki_covid19", + "link_source_id": obj["cnki_id"], + "ingest_request_source": "scrape-covid19", + } + ) return requests + def transform_wanfang(obj): - assert obj['wanfang_id'] - return [{ - 'base_url': canon(obj['url']), - 'ingest_type': 'pdf', - 'link_source': 'wanfang_covid19', - 'link_source_id': obj['wanfang_id'], - 'ingest_request_source': 'scrape-covid19', - }] + assert obj["wanfang_id"] + return [ + { + "base_url": canon(obj["url"]), + "ingest_type": "pdf", + "link_source": "wanfang_covid19", + "link_source_id": obj["wanfang_id"], + "ingest_request_source": "scrape-covid19", + } + ] def run(args): @@ -58,26 +64,27 @@ def run(args): continue row = json.loads(l) - if 'wanfang_id' in row: + if "wanfang_id" in row: requests = transform_wanfang(row) or [] - elif 'cnki_id' in row: + elif "cnki_id" in row: requests = transform_cnki(row) or [] else: continue for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="COVID-19 metadata file to use", - type=argparse.FileType('r')) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 86b3b35..27ccf21 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -19,23 +19,20 @@ Output: - log to stdout (redirect to file), prefixed by sha1 Requires: -- raven (sentry) +- sentry-sdk - boto3 (AWS S3 client library) """ -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter import boto3 -import raven - -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +import sentry_sdk def b32_hex(s): @@ -45,81 +42,80 @@ def b32_hex(s): s = s[5:] if len(s) != 32: return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") -class DeliverDumpGrobidS3(): +class DeliverDumpGrobidS3: def __init__(self, s3_bucket, **kwargs): self.rstore = None self.count = Counter() self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') - self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') - self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') - self.s3 = boto3.resource('s3') + self.s3_prefix = kwargs.get("s3_prefix", "grobid/") + self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml") + self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD") + self.s3 = boto3.resource("s3") self.bucket = self.s3.Bucket(self.s3_bucket) def run(self, dump_file): sys.stderr.write("Starting...\n") for line in dump_file: - line = line.strip().split('\t') + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, grobid_json = line[0], line[1] if len(sha1_hex) != 40: sha1_hex = b32_hex(sha1_hex) assert len(sha1_hex) == 40 grobid = json.loads(grobid_json) - tei_xml = grobid.get('tei_xml') + tei_xml = grobid.get("tei_xml") if not tei_xml: print("{}\tskip empty".format(sha1_hex)) - self.count['skip-empty'] += 1 + self.count["skip-empty"] += 1 continue - tei_xml = tei_xml.encode('utf-8') + tei_xml = tei_xml.encode("utf-8") # upload to AWS S3 obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), + Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), Body=tei_xml, StorageClass=self.s3_storage_class, ) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) - self.count['success-s3'] += 1 + self.count["success-s3"] += 1 sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="grobid/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - type=str, - default=".tei.xml", - help='file suffix for created objects') - parser.add_argument('--s3-storage-class', - type=str, - default="STANDARD", - help='AWS S3 storage class (redundancy) to use') - parser.add_argument('dump_file', - help="TSV/JSON dump file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" + ) + parser.add_argument( + "--s3-prefix", + type=str, + default="grobid/", + help="key prefix for items created in bucket", + ) + parser.add_argument( + "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects" + ) + parser.add_argument( + "--s3-storage-class", + type=str, + default="STANDARD", + help="AWS S3 storage class (redundancy) to use", + ) + parser.add_argument( + "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r") + ) args = parser.parse_args() + sentry_sdk.init() + worker = DeliverDumpGrobidS3(**args.__dict__) worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == 
"__main__": # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index 3dcf962..093f32a 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -7,160 +7,191 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk. # in `wayback` library. Means we can't run pylint. # pylint: skip-file -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter +from http.client import IncompleteRead -import raven +import sentry_sdk import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +from wayback.resourcestore import ResourceStore class DeliverGwbDisk: - def __init__(self, disk_dir, **kwargs): - self.warc_uri_prefix = kwargs.get('warc_uri_prefix') + self.warc_uri_prefix = kwargs.get("warc_uri_prefix") self.rstore = None self.count = Counter() # /serve/ instead of /download/ doesn't record view count - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/") # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get( + "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") + ) self.disk_dir = disk_dir - self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') - self.disk_suffix = kwargs.get('disk_suffix', '.pdf') + self.disk_prefix = kwargs.get("disk_prefix", "pdf/") + self.disk_suffix = kwargs.get("disk_suffix", ".pdf") def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory( + webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url, + ) + ) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", + ) except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})".format( + ve + ), + ) except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})".format( + eofe + ), + ) except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + return None, dict( + status="error", + 
reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( + te + ), + ) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: - return None, dict(status="error", + return None, dict( + status="error", reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + warc_status=gwb_record.get_status()[0], + ) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return None, dict( + status="error", + reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( + ire + ), + ) return raw_content, None def run(self, manifest_file): sys.stderr.write("Ensuring all 65536 base directories exist...\n") for i in range(256): for j in range(256): - fpath = "{}/{}{:02x}/{:02x}".format( - self.disk_dir, - self.disk_prefix, - i, - j) + fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j) os.makedirs(fpath, exist_ok=True) sys.stderr.write("Starting...\n") for line in manifest_file: - self.count['total'] += 1 - line = line.strip().split('\t') + self.count["total"] += 1 + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, cdx_json = line[0], line[1] assert len(sha1_hex) == 40 file_cdx = json.loads(cdx_json) # If warc is not item/file.(w)arc.gz form, skip it - if len(file_cdx['warc'].split('/')) != 2: - sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) - print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) - self.count['skip-warc'] += 1 + if len(file_cdx["warc"].split("/")) != 2: + sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"])) + print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) + self.count["skip-warc"] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + blob, status = self.fetch_warc_content( + file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] + ) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) - self.count['err-petabox-fetch'] += 1 + print( + "{}\terror petabox\t{}\t{}".format( + sha1_hex, file_cdx["warc"], status["reason"] + ) + ) + self.count["err-petabox-fetch"] += 1 continue elif not blob: print("{}\tskip-empty-blob".format(sha1_hex)) - self.count['skip-empty-blob'] += 1 + self.count["skip-empty-blob"] += 1 continue # verify sha1 if sha1_hex != hashlib.sha1(blob).hexdigest(): - #assert sha1_hex == hashlib.sha1(blob).hexdigest() - #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) + # assert sha1_hex == hashlib.sha1(blob).hexdigest() + # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) print("{}\terror petabox-hash-mismatch".format(sha1_hex)) - self.count['err-petabox-hash-mismatch'] += 1 + self.count["err-petabox-hash-mismatch"] += 1 - self.count['petabox-ok'] += 1 + self.count["petabox-ok"] += 1 # save to disk fpath = "{}/{}{}/{}/{}{}".format( - self.disk_dir, - self.disk_prefix, - sha1_hex[0:2], - sha1_hex[2:4], - sha1_hex, - self.disk_suffix) - with 
open(fpath, 'wb') as f: + self.disk_dir, + self.disk_prefix, + sha1_hex[0:2], + sha1_hex[2:4], + sha1_hex, + self.disk_suffix, + ) + with open(fpath, "wb") as f: f.write(blob) print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) - self.count['success-disk'] += 1 + self.count["success-disk"] += 1 sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--disk-dir', - required=True, - type=str, - help='local base directory to save into') - parser.add_argument('--disk-prefix', - type=str, - default="pdf/", - help='directory prefix for items created in bucket') - parser.add_argument('--disk-suffix', - type=str, - default=".pdf", - help='file suffix for created files') - parser.add_argument('--warc-uri-prefix', - type=str, - default='https://archive.org/serve/', - help='URI where WARCs can be found') - parser.add_argument('manifest_file', - help="TSV/JSON manifest file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--disk-dir", required=True, type=str, help="local base directory to save into" + ) + parser.add_argument( + "--disk-prefix", + type=str, + default="pdf/", + help="directory prefix for items created in bucket", + ) + parser.add_argument( + "--disk-suffix", type=str, default=".pdf", help="file suffix for created files" + ) + parser.add_argument( + "--warc-uri-prefix", + type=str, + default="https://archive.org/serve/", + help="URI where WARCs can be found", + ) + parser.add_argument( + "manifest_file", + help="TSV/JSON manifest file", + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() + sentry_sdk.init() + worker = DeliverGwbDisk(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__": # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index 39ac000..6f37ede 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -24,7 +24,7 @@ Output: - log to stdout (redirect to file), prefixed by sha1 Requires: -- raven (sentry) +- sentry-sdk - boto3 (AWS S3 client library) - wayback/GWB libraries """ @@ -33,152 +33,180 @@ Requires: # in `wayback` library. Means we can't run pylint. # pylint: skip-file -import os -import sys -import json +import argparse import base64 import hashlib -import argparse +import json +import os +import sys from collections import Counter +from http.client import IncompleteRead import boto3 -import raven +import sentry_sdk import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory - -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() +from wayback.resourcestore import ResourceStore class DeliverGwbS3: - def __init__(self, s3_bucket, **kwargs): - self.warc_uri_prefix = kwargs.get('warc_uri_prefix') + self.warc_uri_prefix = kwargs.get("warc_uri_prefix") self.rstore = None self.count = Counter() # /serve/ instead of /download/ doesn't record view count - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/") # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get( + "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") + ) self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') - self.s3_suffix = kwargs.get('s3_suffix', '.pdf') - self.s3 = boto3.resource('s3') + self.s3_prefix = kwargs.get("s3_prefix", "pdf/") + self.s3_suffix = kwargs.get("s3_suffix", ".pdf") + self.s3 = boto3.resource("s3") self.bucket = self.s3.Bucket(self.s3_bucket) def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory( + webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url, + ) + ) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", + ) except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})".format( + ve + ), + ) except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})".format( + eofe + ), + ) except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( + te + ), + ) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. 
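# Aside: a sketch of the generic catch-all handler that the comment above
# considers and rejects (kept commented out; the current code deliberately
# fails loudly on unexpected errors so cluster-wide outages are visible):
#
#     try:
#         gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
#     except Exception as e:  # hypothetical catch-all, not in the actual code
#         return None, dict(status="error", reason="petabox failure: {}".format(e))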
if gwb_record.get_status()[0] != 200: - return None, dict(status="error", + return None, dict( + status="error", reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + warc_status=gwb_record.get_status()[0], + ) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return None, dict( + status="error", + reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( + ire + ), + ) return raw_content, None def run(self, manifest_file): sys.stderr.write("Starting...\n") for line in manifest_file: - self.count['total'] += 1 - line = line.strip().split('\t') + self.count["total"] += 1 + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, cdx_json = line[0], line[1] assert len(sha1_hex) == 40 file_cdx = json.loads(cdx_json) # If warc is not item/file.(w)arc.gz form, skip it - if len(file_cdx['warc'].split('/')) != 2: - sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) - print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) - self.count['skip-warc'] += 1 + if len(file_cdx["warc"].split("/")) != 2: + sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"])) + print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) + self.count["skip-warc"] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + blob, status = self.fetch_warc_content( + file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] + ) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) - self.count['err-petabox-fetch'] += 1 + print( + "{}\terror petabox\t{}\t{}".format( + sha1_hex, file_cdx["warc"], status["reason"] + ) + ) + self.count["err-petabox-fetch"] += 1 continue elif not blob: print("{}\tskip-empty-blob".format(sha1_hex)) - self.count['skip-empty-blob'] += 1 + self.count["skip-empty-blob"] += 1 continue # verify sha1 if sha1_hex != hashlib.sha1(blob).hexdigest(): - #assert sha1_hex == hashlib.sha1(blob).hexdigest() - #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) + # assert sha1_hex == hashlib.sha1(blob).hexdigest() + # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) print("{}\terror petabox-hash-mismatch".format(sha1_hex)) - self.count['err-petabox-hash-mismatch'] += 1 + self.count["err-petabox-hash-mismatch"] += 1 - self.count['petabox-ok'] += 1 + self.count["petabox-ok"] += 1 # upload to AWS S3 obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), - Body=blob) + Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), + Body=blob, + ) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) - self.count['success-s3'] += 1 + self.count["success-s3"] += 1 sys.stderr.write("{}\n".format(self.count)) -@sentry_client.capture_exceptions + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="pdf/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - 
type=str, - default=".pdf", - help='file suffix for created objects') - parser.add_argument('--warc-uri-prefix', - type=str, - default='https://archive.org/serve/', - help='URI where WARCs can be found') - parser.add_argument('manifest_file', - help="TSV/JSON manifest file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" + ) + parser.add_argument( + "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket" + ) + parser.add_argument( + "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects" + ) + parser.add_argument( + "--warc-uri-prefix", + type=str, + default="https://archive.org/serve/", + help="URI where WARCs can be found", + ) + parser.add_argument( + "manifest_file", + help="TSV/JSON manifest file", + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() + sentry_sdk.init() + worker = DeliverGwbS3(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == "__main__": # pragma: no cover main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index b981ab6..aef5c12 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an DOAJ article dump (JSON) into ingest requests. @@ -9,31 +8,31 @@ in the HTML headers and adds an ingest request on that basis. Or even just run the re-ingest in-process and publish a second result. """ -import sys -import json import argparse +import json +import sys +from typing import List, Optional + import urlcanon -from typing import Optional, List DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", "ncbi.nlm.nih.gov/", - #"semanticscholar.org/", + # "semanticscholar.org/", "://doi.org/", + "://dx.doi.org/", "zenodo.org/", "figshare.com/", "://archive.org/", ".archive.org/", - # large publishers/platforms; may remove in the future - #"://link.springer.com/", - #"://dergipark.gov.tr/", - #"frontiersin.org/", - #"scielo", + # "://link.springer.com/", + # "://dergipark.gov.tr/", + # "frontiersin.org/", + # "scielo", ] # these default to PDF; note that we also do pdf ingests for HTML pages @@ -41,78 +40,83 @@ CONTENT_TYPE_MAP = { "abstract": [], "doc": [], "": ["pdf"], - "doi": ["pdf"], "url": ["pdf"], "fulltext": ["pdf"], "anySimpleType": ["pdf"], - "application/pdf": ["pdf"], "html": ["html", "pdf"], "text/html": ["html", "pdf"], "xml": ["xml"], } + def canon(s: str) -> str: parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj: dict) -> List[dict]: """ Transforms from a single DOAJ object to zero or more ingest requests. Returns a list of dicts. 
""" - doaj_id = obj['id'].lower() + doaj_id = obj["id"].lower() assert doaj_id - bibjson = obj['bibjson'] - if not bibjson['link']: + bibjson = obj["bibjson"] + if not bibjson["link"]: return [] requests = [] doi: Optional[str] = None - for ident in (bibjson['identifier'] or []): - if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'): - doi = ident['id'].lower() + for ident in bibjson["identifier"] or []: + if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."): + doi = ident["id"].lower() - for link in (bibjson['link'] or []): - if link.get('type') != "fulltext" or not link.get('url'): + for link in bibjson["link"] or []: + if link.get("type") != "fulltext" or not link.get("url"): continue - ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower()) + ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower()) if not ingest_types: continue + skip = False for domain in DOMAIN_BLOCKLIST: - if domain in link['url'].lower(): + if domain in link["url"].lower(): skip = True if skip: continue try: - base_url = canon(link['url']) + base_url = canon(link["url"].strip()) except UnicodeEncodeError: continue + if not base_url or len(base_url) > 1000: + continue + for ingest_type in ingest_types: request = { - 'base_url': base_url, - 'ingest_type': ingest_type, - 'link_source': 'doaj', - 'link_source_id': doaj_id, - 'ingest_request_source': 'doaj', - 'release_stage': 'published', - 'rel': 'publisher', - 'ext_ids': { - 'doi': doi, - 'doaj': doaj_id, + "base_url": base_url, + "ingest_type": ingest_type, + "link_source": "doaj", + "link_source_id": doaj_id, + "ingest_request_source": "doaj", + "release_stage": "published", + "rel": "publisher", + "ext_ids": { + "doi": doi, + "doaj": doaj_id, }, - 'edit_extra': {}, + "edit_extra": {}, } requests.append(request) return requests + def run(args) -> None: for l in args.json_file: if not l.strip(): @@ -123,17 +127,18 @@ def run(args) -> None: for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main() -> None: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="DOAJ article dump file to use", - type=argparse.FileType('r')) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 9fe1499..44c091c 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -17,29 +17,32 @@ And outputs JSON objects that are can be imported into fatcat with the No dependencies (only python3 stdlib) """ -import sys -import json import base64 +import json +import sys + def run(): for line in sys.stdin: - line = line.split('\t') + line = line.split("\t") assert len(line) == 5 - raw_sha1 = line[0].replace('sha1:', '') + raw_sha1 = line[0].replace("sha1:", "") dois = json.loads(line[1]) cdx = json.loads(line[2]) mimetype = line[3] size = int(line[4]) - sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() + sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower() obj = dict( sha1=sha1, dois=dois, - 
cdx=[dict(url=cdx['url'], dt=cdx['dt'])], + cdx=[dict(url=cdx["url"], dt=cdx["dt"])], size=size, - mimetype=mimetype) + mimetype=mimetype, + ) print(json.dumps(obj)) -if __name__=='__main__': + +if __name__ == "__main__": run() diff --git a/python/scripts/fetch_cdx_sha1hex.py b/python/scripts/fetch_cdx_sha1hex.py new file mode 100755 index 0000000..2eb56cb --- /dev/null +++ b/python/scripts/fetch_cdx_sha1hex.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +""" +This is a helper script to take fatcat file entities with partial metadata (eg, +missing SHA256) and try to find one or more CDX record where the file may be +found in wayback. + +This script uses the sandcrawler library and should be run like: + + head file_export.json | python -m scripts.fetch_cdx_sha1hex > results.json +""" + +import base64 +import json +import sys +from typing import List, Optional + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error + +from sandcrawler.ia import CdxApiClient, cdx_to_dict + + +def requests_retry_session( + retries: int = 10, + backoff_factor: int = 3, + status_forcelist: List[int] = [500, 502, 504], + session: requests.Session = None, +) -> requests.Session: + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = session or requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + + +def b32_hex(s: str) -> str: + """ + Converts a base32-encoded SHA-1 checksum into hex-encoded + + base32 checksums are used by, eg, heritrix and in wayback CDX files + """ + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + if len(s) == 40: + return s + raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") + + +SANDCRAWLER_POSTGREST_URL = "http://wbgrp-svc506.us.archive.org:3030" + + +def get_db_cdx(sha1hex: str, http_session) -> List[dict]: + resp = http_session.get( + SANDCRAWLER_POSTGREST_URL + "/cdx", params=dict(sha1hex="eq." + sha1hex) + ) + resp.raise_for_status() + rows = resp.json() + return rows or [] + + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + + +def get_api_cdx(url: str, sha1hex: str, cdx_api) -> Optional[dict]: + + params = { + "url": url, + "output": "json", + "matchType": "exact", + "limit": 20, + # TODO: group-by digest/checksum? 
+ # can't filter status because might be warc/revisit + # "filter": "statuscode:200", + } + rows = cdx_api._query_api(params) + if not rows: + return None + for row in rows: + if row.sha1hex == sha1hex: + return row + return None + + +def process_file(fe, session, cdx_api) -> dict: + status = "unknown" + + # simple CDX db lookup first + cdx_row_list = get_db_cdx(fe["sha1"], http_session=session) + if cdx_row_list: + return dict( + file_entity=fe, + cdx_rows=cdx_row_list, + status="success-db", + ) + + original_urls = [] + for pair in fe["urls"]: + u = pair["url"] + if not "://web.archive.org/web/" in u: + continue + seg = u.split("/") + assert seg[2] == "web.archive.org" + assert seg[3] == "web" + if not seg[4].isdigit(): + continue + original_url = "/".join(seg[5:]) + original_urls.append(original_url) + + if len(original_urls) == 0: + return dict(file_entity=fe, status="skip-no-urls") + + found_cdx_rows = [] + for url in list(set(original_urls)): + + cdx_record = None + try: + cdx_record = get_api_cdx(original_url, sha1hex=fe["sha1"], cdx_api=cdx_api) + except requests.exceptions.HTTPError as e: + if e.response.status_code == 403: + return dict(file_entity=fe, status="fail-cdx-403") + else: + raise + if cdx_record and cdx_record.sha1hex == fe["sha1"]: + found_cdx_rows.append(cdx_to_dict(cdx_record)) + + if found_cdx_rows: + return dict( + file_entity=fe, + cdx_rows=found_cdx_rows, + status="success-api", + ) + + return dict( + file_entity=fe, + status="fail-not-found", + ) + + +def main(): + session = requests_retry_session() + session.headers.update( + { + "User-Agent": "Mozilla/5.0 fatcat.CdxFixupBot", + } + ) + cdx_api = CdxApiClient() + for line in sys.stdin: + if not line.strip(): + continue + fe = json.loads(line) + print(json.dumps(process_file(fe, session=session, cdx_api=cdx_api))) + + +if __name__ == "__main__": + main() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index dc4bea7..8fce0d9 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -1,44 +1,49 @@ #!/usr/bin/env python3 -import sys import json +import sys -with open('title_slug_denylist.txt', 'r') as f: +with open("title_slug_denylist.txt", "r") as f: TITLE_DENYLIST = [l.strip() for l in f] -TITLE_DENYLIST.extend(( - 'editorial', - 'advertisement', - 'bookreviews', - 'reviews', - 'nr', - 'abstractoriginalarticle', - 'originalarticle', - 'impactfactor', - 'articlenumber', -)) +TITLE_DENYLIST.extend( + ( + "editorial", + "advertisement", + "bookreviews", + "reviews", + "nr", + "abstractoriginalarticle", + "originalarticle", + "impactfactor", + "articlenumber", + ) +) # The full name can't *entirely* be one of these NAME_DENYLIST = ( - 'phd', - 'phdstudent', + "phd", + "phdstudent", ) + def tokenize(s, remove_whitespace=True): - s.replace(''', "'") + s.replace("'", "'") # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) + s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()]) if remove_whitespace: - s = ''.join(s.split()) + s = "".join(s.split()) # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + return s.encode("ascii", "replace").decode("utf8").replace("?", "") + assert tokenize("Impact Factor: 2.114") == "impactfactor" assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST + def filter_title(title): title = title.strip() @@ -47,14 +52,14 @@ def filter_title(title): title_slug = 
tokenize(title, remove_whitespace=True) if len(title_slug) < 10 or title_slug in TITLE_DENYLIST: return None - if title_slug.startswith('nr'): + if title_slug.startswith("nr"): return None - if title.lower().replace('.', '').startswith('int j '): + if title.lower().replace(".", "").startswith("int j "): return None for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "): if title.startswith(prefix): - title.replace(prefix, '') + title.replace(prefix, "") if title.startswith("The Journal of "): return None @@ -78,63 +83,84 @@ def filter_title(title): return None # too deep subtitling/splitting - if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: + if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1: return None return title + def filter_author_name(name): - name = name['name'] - if name.strip().lower().replace(' ', '') in NAME_DENYLIST: + name = name["name"] + if name.strip().lower().replace(" ", "") in NAME_DENYLIST: return None - return ' '.join([t for t in name.split() if tokenize(t)]) + return " ".join([t for t in name.split() if tokenize(t)]) + def filter_authors(l): return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] + def filter_refs(l): # TODO: return l + def filter_journal_name(name): # same denylist, for now if not name: return None - name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') + name = name.replace(" e-ISSN", "").replace(" p-ISSN", "") slug_name = tokenize(name) if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": return None - for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): + for prefix in ( + "/ ", + "~ ", + "& ", + "© ", + "Original Research Article ", + "Original Article ", + "Research Article ", + "Available online www.jocpr.com ", + ): if name.startswith(prefix): - name = name.replace(prefix, '') - for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): + name = name.replace(prefix, "") + for suffix in ( + " Available online at www.sciarena.com", + " Original Article", + " Available online at", + " ISSN", + " ISSUE", + ): if name.endswith(suffix): - name = name.replace(suffix, '') + name = name.replace(suffix, "") if "====================" in name: return None if len(name) > 150: return None - return ' '.join(name.split()) + return " ".join(name.split()) + def filter_metadata(obj): - if not (obj.get('title') and obj.get('authors')): + if not (obj.get("title") and obj.get("authors")): return None - title = filter_title(obj['title']) + title = filter_title(obj["title"]) if not title: - #sys.stderr.write("bad title\n") + # sys.stderr.write("bad title\n") return None else: - obj['title'] = title - obj['authors'] = filter_authors(obj['authors']) - obj['citations'] = filter_refs(obj['citations']) - obj['journal']['name'] = filter_journal_name(obj['journal']['name']) + obj["title"] = title + obj["authors"] = filter_authors(obj["authors"]) + obj["citations"] = filter_refs(obj["citations"]) + obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"]) return obj + def run(invert=False): for line in sys.stdin: - fields = line.split('\t') + fields = line.split("\t") if len(fields) == 5: raw = fields[4] elif len(fields) == 1: @@ -151,9 +177,10 @@ def run(invert=False): fields[4] = processed else: fields[0] = processed - print('\t'.join(fields)) + print("\t".join(fields)) elif invert: 
print(raw.strip()) -if __name__=="__main__": + +if __name__ == "__main__": run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index bbba770..87dae16 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out: - dates differ (not just year) """ -import sys import json +import sys # out of 1000 SCORE_THRESHOLD = 900 @@ -28,17 +28,19 @@ MAX_SLUG_LINES = 50 REQUIRE_AUTHORS = False + def tokenize(s, remove_whitespace=False): - s.replace(''', "'") + s.replace("'", "'") # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()]) if remove_whitespace: - s = ''.join(s.split()) + s = "".join(s.split()) # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').replace(b'?', b'') + return s.encode("ascii", "replace").replace(b"?", b"") + def check_authors(left, right): """ @@ -51,7 +53,7 @@ def check_authors(left, right): return False right_all = tokenize(" ".join(right)) for i in range(len(left)): - l = left[i].lower().replace('jr.', '').split() + l = left[i].lower().replace("jr.", "").split() if not l: return False l = tokenize(l[-1]) @@ -59,20 +61,22 @@ def check_authors(left, right): # weird author name (single char) return False if l not in right_all: - #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) + # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) return False return True + def test_check_authors(): assert check_authors([], []) == bool(not REQUIRE_AUTHORS) - assert not check_authors([], ['one']) - assert check_authors(['one'], ['one']) - assert check_authors(['one two'], ['One Two']) - assert check_authors(['two'], ['One Two']) - assert check_authors(['two'], ['two, one']) - assert check_authors(['mago'], ['Mr. Magoo']) - assert check_authors(['Mr. Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + assert not check_authors([], ["one"]) + assert check_authors(["one"], ["one"]) + assert check_authors(["one two"], ["One Two"]) + assert check_authors(["two"], ["One Two"]) + assert check_authors(["two"], ["two, one"]) + assert check_authors(["mago"], ["Mr. Magoo"]) + assert check_authors(["Mr. 
Magoo"], ["Mr Magoo"]) + assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) + # Rows are (score, left, right) def process_group(rows): @@ -86,10 +90,10 @@ def process_group(rows): left = json.loads(row[1]) right = json.loads(row[2]) # authors must roughly match - if not check_authors(left['authors'], right['authors']): + if not check_authors(left["authors"], right["authors"]): continue # years must match (if defined) - if left['year'] and right['year'] and left['year'] != right['year']: + if left["year"] and right["year"] and left["year"] != right["year"]: continue filtered.append((left, right)) @@ -101,8 +105,8 @@ def process_group(rows): group_ids = set() for row in filtered[1:]: (left, right) = row - l_id = left['fatcat_release'] - r_id = right['fatcat_release'] + l_id = left["fatcat_release"] + r_id = right["fatcat_release"] releases[l_id] = left releases[r_id] = right if not group_ids: @@ -119,6 +123,7 @@ def process_group(rows): print(json.dumps([releases[ident] for ident in group_ids])) + def run(): last_slug = None @@ -126,7 +131,7 @@ def run(): # group lines by slug, and process in batches for line in sys.stdin: - line = line.strip().split('\t') + line = line.strip().split("\t") assert len(line) == 4 slug = line[0] if last_slug and slug != last_slug and lines: @@ -140,5 +145,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__": run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3654b87..c5b7eef 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file). No dependencies (only python3 stdlib) """ -import sys import json +import sys # out of 1000 score_threshold = 900 @@ -23,15 +23,16 @@ require_authors = 1 def tokenize(s, remove_whitespace=False): - s.replace(''', "'") + s.replace("'", "'") # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()]) if remove_whitespace: - s = ''.join(s.split()) + s = "".join(s.split()) # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').replace(b'?', b'') + return s.encode("ascii", "replace").replace(b"?", b"") + def check_authors(left, right): """ @@ -44,7 +45,7 @@ def check_authors(left, right): return False right_all = tokenize(" ".join(right)) for i in range(len(left)): - l = left[i].lower().replace('jr.', '').split() + l = left[i].lower().replace("jr.", "").split() if not l: return False l = tokenize(l[-1]) @@ -52,20 +53,22 @@ def check_authors(left, right): # weird author name (single char) return False if l not in right_all: - #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) + # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) return False return True + def test_check_authors(): assert not check_authors([], []) - assert not check_authors([], ['one']) - assert check_authors(['one'], ['one']) - assert check_authors(['one two'], ['One Two']) - assert check_authors(['two'], ['One Two']) - assert check_authors(['two'], ['two, one']) - assert check_authors(['mago'], ['Mr. Magoo']) - assert check_authors(['Mr. 
Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + assert not check_authors([], ["one"]) + assert check_authors(["one"], ["one"]) + assert check_authors(["one two"], ["One Two"]) + assert check_authors(["two"], ["One Two"]) + assert check_authors(["two"], ["two, one"]) + assert check_authors(["mago"], ["Mr. Magoo"]) + assert check_authors(["Mr. Magoo"], ["Mr Magoo"]) + assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) + # Rows are (score, grobid, crossref) def process_group(rows): @@ -78,20 +81,21 @@ def process_group(rows): continue grobid = json.loads(row[1]) crossref = json.loads(row[2]) - if not check_authors(crossref['authors'], grobid['authors']): - #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) + if not check_authors(crossref["authors"], grobid["authors"]): + # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) continue else: - #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) + # print("YES: {} {}".format(crossref['authors'], grobid['authors'])) pass - sha1 = grobid['sha1'] - doi = crossref['doi'].lower() + sha1 = grobid["sha1"] + doi = crossref["doi"].lower() l = keepers.get(sha1, list()) l.append(doi) keepers[sha1] = l for sha1, doi_list in keepers.items(): print("{}\t{}".format(sha1, json.dumps(doi_list))) + def run(): last_slug = None @@ -99,7 +103,7 @@ def run(): # group lines by slug, and process in batches for line in sys.stdin: - line = line.strip().split('\t') + line = line.strip().split("\t") assert len(line) == 4 slug = line[0] if last_slug and slug != last_slug and lines: @@ -112,5 +116,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__": run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index 79feac1..90a0f77 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction output, converts the XML to JSON, filters out raw affiliation strings, and @@ -10,43 +9,49 @@ Run in bulk like: ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations' """ -import sys import json +import sys + +from grobid_tei_xml import parse_document_xml -from grobid2json import teixml2json def parse_hbase(line): - line = line.split('\t') + line = line.split("\t") assert len(line) == 2 sha1hex = line[0] obj = json.loads(line[1]) - tei_xml = obj['tei_xml'] + tei_xml = obj["tei_xml"] return sha1hex, tei_xml + def parse_pg(line): obj = json.loads(line) - return obj['sha1hex'], obj['tei_xml'] + return obj["sha1hex"], obj["tei_xml"] + -def run(mode='hbase'): +def run(mode="hbase"): for line in sys.stdin: - if mode == 'hbase': + if mode == "hbase": sha1hex, tei_xml = parse_hbase(line) - elif mode == 'pg': + elif mode == "pg": sha1hex, tei_xml = parse_pg(line) else: - raise NotImplementedError('parse mode: {}'.format(mode)) + raise NotImplementedError("parse mode: {}".format(mode)) - obj = teixml2json(tei_xml, encumbered=False) + tei_doc = parse_document_xml(tei_xml) + tei_doc.remove_encumbered() + obj = tei_doc.to_legacy_dict() affiliations = [] - for author in obj['authors']: - if author.get('affiliation'): - affiliations.append(author['affiliation']) + for author in obj["authors"]: + if author.get("affiliation"): + 
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 3d2e14c..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,69 +1,67 @@
 #!/usr/bin/env python3
 
-import sys
-import json
 import datetime
+import json
+import sys
+
+MAX_ABSTRACT_BYTES = 4096
 
-MAX_ABSTRACT_BYTES=4096
 
 def parse_grobid_json(obj):
 
-    if not obj.get('title'):
+    if not obj.get("title"):
         return None
 
     extra = dict()
 
-    if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
-        abobj = dict(
-            mimetype="text/plain",
-            language=None,
-            content=obj.get('abstract').strip())
+    if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+        abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
         abstracts = [abobj]
     else:
         abstracts = None
 
     contribs = []
-    for a in obj.get('authors', []):
+    for a in obj.get("authors", []):
         c = dict(raw_name=a, role="author")
         contribs.append(c)
 
     refs = []
-    for raw in obj.get('citations', []):
+    for raw in obj.get("citations", []):
         extra = dict()
         ref = dict()
-        ref['key'] = raw.get('id')
-        if raw.get('title'):
-            ref['title'] = raw['title'].strip()
-        if raw.get('date'):
+        ref["key"] = raw.get("id")
+        if raw.get("title"):
+            ref["title"] = raw["title"].strip()
+        if raw.get("date"):
             try:
-                year = int(raw['date'].strip()[:4])
-                ref['year'] = year
+                year = int(raw["date"].strip()[:4])
+                ref["year"] = year
             except:
                 pass
-        for key in ('volume', 'url', 'issue', 'publisher'):
+        for key in ("volume", "url", "issue", "publisher"):
             if raw.get(key):
                 extra[key] = raw[key].strip()
-        if raw.get('authors'):
-            extra['authors'] = [a['name'] for a in raw['authors']]
+        if raw.get("authors"):
+            extra["authors"] = [a["name"] for a in raw["authors"]]
         if extra:
             extra = dict(grobid=extra)
         else:
             extra = None
-        ref['extra'] = extra
+        ref["extra"] = extra
         refs.append(ref)
 
     release_type = "journal-article"
     release_date = None
-    if obj.get('date'):
+    if obj.get("date"):
         # TODO: only returns year, ever? how to handle?
-        release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+        release_date = datetime.datetime(year=obj["date"], month=1, day=1)
 
-    if obj.get('doi'):
-        extra['doi'] = obj['doi']
-    if obj['journal'].get('name'):
-        extra['container_name'] = obj['journal']['name']
+    if obj.get("doi"):
+        extra["doi"] = obj["doi"].lower()
+    if obj["journal"].get("name"):
+        extra["container_name"] = obj["journal"]["name"]
 
-    extra['is_longtail_oa'] = True
+    extra["is_longtail_oa"] = True
 
     # TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -73,15 +71,17 @@
         extra = None
 
     return dict(
-        title=obj['title'].strip(),
+        title=obj["title"].strip(),
         contribs=contribs,
-        publisher=obj['journal'].get('publisher'),
-        volume=obj['journal'].get('volume'),
-        issue=obj['journal'].get('issue'),
+        publisher=obj["journal"].get("publisher"),
+        volume=obj["journal"].get("volume"),
+        issue=obj["journal"].get("issue"),
         abstracts=abstracts,
         release_type=release_type,
         release_date=release_date,
-        extra=extra)
+        extra=extra,
+    )
+
 
 def run():
     for line in sys.stdin:
@@ -90,5 +90,6 @@
     if out:
         print(out)
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     run()
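Besides the quote-style churn, one behavior change above is that the extracted DOI is now lower-cased. A hedged example of parse_grobid_json() on a minimal record, assuming the function is in scope (field values are made up; real GROBID metadata carries many more fields):

    obj = {
        "title": "An Example Paper",
        "authors": ["Jane Doe"],
        "journal": {"name": "Example Journal"},
        "doi": "10.1234/EXAMPLE",
    }
    release = parse_grobid_json(obj)
    assert release["title"] == "An Example Paper"
    # the DOI lands in the release's extra metadata as "10.1234/example"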
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..8a353ca 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-
 """
 This script is used to turn ingest request postgres rows (in JSON export
 format) back in to regular ingest request JSON.
@@ -7,24 +6,25 @@ format) back in to regular ingest request JSON.
 The only difference is the name and location of some optional keys.
 """
 
-import sys
-import json
 import argparse
+import json
+import sys
 
 
 def transform(row):
     """
     dict-to-dict
     """
-    row.pop('created', None)
-    extra = row.pop('request', None) or {}
-    for k in ('ext_ids', 'edit_extra'):
+    row.pop("created", None)
+    extra = row.pop("request", None) or {}
+    for k in ("ext_ids", "edit_extra"):
         if k in extra:
             row[k] = extra[k]
-    if 'release_ident' in extra:
-        row['fatcat'] = dict(release_ident=extra['release_ident'])
+    if "release_ident" in extra:
+        row["fatcat"] = dict(release_ident=extra["release_ident"])
     return row
 
+
 def run(args):
     for l in args.json_file:
         if not l.strip():
@@ -33,19 +33,27 @@ def run(args):
             req = transform(json.loads(l))
         except:
             print(l, file=sys.stderr)
+        if args.force_recrawl:
+            req["force_recrawl"] = True
         print(json.dumps(req, sort_keys=True))
 
+
 def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('json_file',
-        help="arabesque output file to use",
-        type=argparse.FileType('r'))
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
+    )
+    parser.add_argument(
+        "--force-recrawl",
+        action="store_true",
+        help="whether to add recrawl (SPNv2) flag to request",
+    )
     subparsers = parser.add_subparsers()
 
     args = parser.parse_args()
 
     run(args)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
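The new --force-recrawl flag simply sets one extra field on every emitted request. A sketch of the row-to-request transform above (values are illustrative):

    row = {
        "created": "2021-01-01T00:00:00+00:00",
        "base_url": "https://example.com/paper.pdf",
        "ingest_type": "pdf",
        "request": {"ext_ids": {"doi": "10.1234/xyz"}, "release_ident": "aaaabbbb"},
    }
    req = transform(row)
    # "created" is dropped and "request" unpacked, yielding:
    # {"base_url": ..., "ingest_type": "pdf",
    #  "ext_ids": {"doi": "10.1234/xyz"},
    #  "fatcat": {"release_ident": "aaaabbbb"}}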
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
 
 to JSON format for fast fatcat importing.
 """
 
-import sys
 import json
 import sqlite3
+import sys
 
 # iterate over rows in files metadata...
 # 1. select all identified DOIs
@@ -20,6 +20,7 @@ import sqlite3
 # 2. select all file metadata
 # 3. output object
 
+
 def or_none(s):
     if s is None:
         return None
@@ -27,6 +28,7 @@ def or_none(s):
         return None
     return s
 
+
 def process_db(db_path):
 
     db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
         dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
     print(json.dumps(obj))
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     process_db(sys.argv[1])
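The conversion walks a sqlite manifest and looks up DOIs per file hash. A minimal sketch of that lookup pattern, based on the one query visible above (database path and sha1 value are placeholders):

    import sqlite3

    db = sqlite3.connect("manifest.sqlite")  # illustrative path
    sha1 = "0" * 40                          # placeholder file sha1
    dois = [row[0] for row in db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])]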
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..97c38f9 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,19 +1,18 @@
 #!/usr/bin/env python3
-
 """
 Transform an OAI-PMH bulk dump (JSON) into ingest requests.
 
 Eg: https://archive.org/details/oai_harvest_20200215
 """
 
-import sys
-import json
 import argparse
+import json
+import sys
+
 import urlcanon
 
 DOMAIN_BLOCKLIST = [
     # large OA publishers (we get via DOI)
-
     # large repos and aggregators (we crawl directly)
     "://arxiv.org/",
     "://europepmc.org/",
@@ -26,23 +25,54 @@ DOMAIN_BLOCKLIST = [
     "://archive.org/",
     ".archive.org/",
     "://127.0.0.1/",
-
+    "://www.kb.dk/",
+    "://kb-images.kb.dk/",
+    "://mdz-nbn-resolving.de/",
+    "://aggr.ukm.um.si/",
+    "://edoc.mpg.de/",
+    "doaj.org/",
+    "orcid.org/",
+    "://gateway.isiknowledge.com/",
     # OAI specific additions
     "://hdl.handle.net/",
 ]
 
+# OAI identifier prefixes for repositories that we want to skip (for various reasons)
+OAI_BLOCKLIST = [
+    "oai:kb.dk:",
+    "oai:bdr.oai.bsb-muenchen.de:",
+    "oai:hispana.mcu.es:",
+    "oai:bnf.fr:",
+    "oai:ukm.si:",
+    "oai:biodiversitylibrary.org:",
+    "oai:hsp.org:",
+    "oai:repec:",
+    "oai:n/a:",
+    "oai:quod.lib.umich.edu:",
+    "oai:americanae.aecid.es:",
+    "oai:www.irgrid.ac.cn:",
+    "oai:espace.library.uq.edu:",
+    "oai:edoc.mpg.de:",
+    "oai:bibliotecadigital.jcyl.es:",
+    "oai:repository.erciyes.edu.tr:",
+    "oai:krm.or.kr:",
+    "oai:hypotheses.org:%",
+]
+
 RELEASE_STAGE_MAP = {
-    'info:eu-repo/semantics/draftVersion': 'draft',
-    'info:eu-repo/semantics/submittedVersion': 'submitted',
-    'info:eu-repo/semantics/acceptedVersion': 'accepted',
-    'info:eu-repo/semantics/publishedVersion': 'published',
-    'info:eu-repo/semantics/updatedVersion': 'updated',
+    "info:eu-repo/semantics/draftVersion": "draft",
+    "info:eu-repo/semantics/submittedVersion": "submitted",
+    "info:eu-repo/semantics/acceptedVersion": "accepted",
+    "info:eu-repo/semantics/publishedVersion": "published",
+    "info:eu-repo/semantics/updatedVersion": "updated",
 }
 
+
 def canon(s):
     parsed = urlcanon.parse_url(s)
     return str(urlcanon.whatwg(parsed))
 
+
 def transform(obj):
     """
     Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -50,38 +80,43 @@ def transform(obj):
     """
     requests = []
-    if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+    if not obj.get("oai") or not obj["oai"].startswith("oai:"):
         return []
-    if not obj.get('urls'):
+    if not obj.get("urls"):
         return []
 
+    oai_id = obj["oai"].lower()
+    for prefix in OAI_BLOCKLIST:
+        if oai_id.startswith(prefix):
+            return []
+
     # look in obj['formats'] for PDF?
-    if obj.get('formats'):
+    if obj.get("formats"):
         # if there is a list of formats, and it does not contain PDF, then
         # skip. Note that we will continue if there is no formats list.
         has_pdf = False
-        for f in obj['formats']:
-            if 'pdf' in f.lower():
+        for f in obj["formats"]:
+            if "pdf" in f.lower():
                 has_pdf = True
         if not has_pdf:
             return []
 
     doi = None
-    if obj.get('doi'):
-        doi = obj['doi'][0].lower().strip()
-        if not doi.startswith('10.'):
+    if obj.get("doi"):
+        doi = obj["doi"][0].lower().strip()
+        if not doi.startswith("10."):
             doi = None
 
     # infer release stage and/or type from obj['types']
     release_stage = None
-    for t in obj.get('types', []):
+    for t in obj.get("types", []):
         if t in RELEASE_STAGE_MAP:
             release_stage = RELEASE_STAGE_MAP[t]
 
     # TODO: infer rel somehow? Eg, repository vs. OJS publisher
     rel = None
 
-    for url in obj['urls']:
+    for url in obj["urls"]:
         skip = False
         for domain in DOMAIN_BLOCKLIST:
             if domain in url:
@@ -94,23 +129,25 @@ def transform(obj):
             continue
 
         request = {
-            'base_url': base_url,
-            'ingest_type': 'pdf',
-            'link_source': 'oai',
-            'link_source_id': obj['oai'].lower(),
-            'ingest_request_source': 'metha-bulk',
-            'release_stage': release_stage,
-            'rel': rel,
-            'ext_ids': {
-                'doi': doi,
-                'oai': obj['oai'].lower(),
+            "base_url": base_url,
+            "ingest_type": "pdf",
+            "link_source": "oai",
+            "link_source_id": oai_id,
+            "ingest_request_source": "metha-bulk",
+            "release_stage": release_stage,
+            "rel": rel,
+            "ext_ids": {
+                "oai": obj["oai"].lower(),
             },
-            'edit_extra': {},
+            "edit_extra": {},
         }
+        if doi:
+            request["ext_ids"]["doi"] = doi
         requests.append(request)
 
     return requests
 
+
 def run(args):
     for l in args.json_file:
         if not l.strip():
@@ -121,17 +158,20 @@ def run(args):
         for r in requests:
             print("{}".format(json.dumps(r, sort_keys=True)))
 
+
 def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('json_file',
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file",
         help="OAI-PMH dump file to use (usually stdin)",
-        type=argparse.FileType('r'))
+        type=argparse.FileType("r"),
+    )
     subparsers = parser.add_subparsers()
 
     args = parser.parse_args()
 
     run(args)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
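The new OAI_BLOCKLIST is matched as a case-insensitive prefix against the record identifier, before any URL handling; the "doi" external identifier is also now only emitted when a DOI is actually present. A sketch of both outcomes, assuming transform() from the script is in scope (identifiers and URLs are illustrative):

    assert transform({"oai": "oai:kb.dk:12345", "urls": ["https://kb.dk/x.pdf"]}) == []

    reqs = transform({"oai": "oai:example.org:42", "urls": ["https://example.org/a.pdf"]})
    assert reqs[0]["ext_ids"] == {"oai": "oai:example.org:42"}  # "doi" key only when present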
""" import sys + import poppler from PIL import Image @@ -22,13 +22,16 @@ def run(inpath, outpath): renderer = poppler.PageRenderer() full_page = renderer.render_page(page) - img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1) - img.thumbnail((180,300), Image.BICUBIC) - #img.thumbnail((360,600), Image.BICUBIC) + img = Image.frombuffer( + "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1 + ) + img.thumbnail((180, 300), Image.BICUBIC) + # img.thumbnail((360,600), Image.BICUBIC) img.save(outpath) - #img.save(outpath, quality=95) + # img.save(outpath, quality=95) + -if __name__ == '__main__': +if __name__ == "__main__": if len(sys.argv) != 3: print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) sys.exit(-1) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 5536e6c..cb64a1a 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,41 +1,39 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. """ -import sys -import json import argparse +import json +import sys + import urlcanon DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", "ncbi.nlm.nih.gov/", - "semanticscholar.org/", "://doi.org/", "zenodo.org/", "figshare.com/", - "://archive.org/", - ".archive.org/", ] RELEASE_STAGE_MAP = { - 'draftVersion': 'draft', - 'submittedVersion': 'submitted', - 'acceptedVersion': 'accepted', - 'publishedVersion': 'published', - 'updatedVersion': 'updated', + "draftVersion": "draft", + "submittedVersion": "submitted", + "acceptedVersion": "accepted", + "publishedVersion": "published", + "updatedVersion": "updated", } + def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj): """ Transforms from a single unpaywall object to zero or more ingest requests. 
@@ -43,48 +41,49 @@ def transform(obj):
     """
     requests = []
-    if not obj['doi'].startswith('10.'):
+    if not obj["doi"].startswith("10."):
         return requests
-    if not obj['oa_locations']:
+    if not obj["oa_locations"]:
         return requests
 
-    for location in obj['oa_locations']:
-        if not location['url_for_pdf']:
+    for location in obj["oa_locations"]:
+        if not location["url_for_pdf"]:
             continue
         skip = False
         for domain in DOMAIN_BLOCKLIST:
-            if domain in location['url_for_pdf']:
+            if domain in location["url_for_pdf"]:
                 skip = True
         if skip:
             continue
         try:
-            base_url = canon(location['url_for_pdf'])
+            base_url = canon(location["url_for_pdf"])
         except UnicodeEncodeError:
             continue
 
         request = {
-            'base_url': base_url,
-            'ingest_type': 'pdf',
-            'link_source': 'unpaywall',
-            'link_source_id': obj['doi'].lower(),
-            'ingest_request_source': 'unpaywall',
-            'release_stage': RELEASE_STAGE_MAP.get(location['version']),
-            'rel': location['host_type'],
-            'ext_ids': {
-                'doi': obj['doi'].lower(),
+            "base_url": base_url,
+            "ingest_type": "pdf",
+            "link_source": "unpaywall",
+            "link_source_id": obj["doi"].lower(),
+            "ingest_request_source": "unpaywall",
+            "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+            "rel": location["host_type"],
+            "ext_ids": {
+                "doi": obj["doi"].lower(),
             },
-            'edit_extra': {},
+            "edit_extra": {},
         }
-        if obj.get('oa_status'):
-            request['edit_extra']['oa_status'] = obj['oa_status']
-        if location.get('evidence'):
-            request['edit_extra']['evidence'] = location['evidence']
-        if location['pmh_id']:
-            request['ext_ids']['pmh_id'] = location['pmh_id']
+        if obj.get("oa_status"):
+            request["edit_extra"]["oa_status"] = obj["oa_status"]
+        if location.get("evidence"):
+            request["edit_extra"]["evidence"] = location["evidence"]
+        if location["pmh_id"]:
+            request["ext_ids"]["pmh_id"] = location["pmh_id"]
         requests.append(request)
 
     return requests
 
+
 def run(args):
     for l in args.json_file:
         if not l.strip():
@@ -95,17 +94,18 @@ def run(args):
         for r in requests:
             print("{}".format(json.dumps(r, sort_keys=True)))
 
+
 def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('json_file',
-        help="unpaywall dump file to use",
-        type=argparse.FileType('r'))
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+    )
    subparsers = parser.add_subparsers()
 
     args = parser.parse_args()
 
     run(args)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
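A hedged end-to-end example of the unpaywall transform, assuming transform() from the script is in scope (the record is illustrative and heavily trimmed; real dump lines carry many more fields):

    obj = {
        "doi": "10.1234/abcd",
        "oa_status": "green",
        "oa_locations": [{
            "url_for_pdf": "https://repo.example.edu/paper.pdf",
            "version": "acceptedVersion",
            "host_type": "repository",
            "evidence": "oa repository",
            "pmh_id": "oai:repo.example.edu:1",
        }],
    }
    reqs = transform(obj)
    assert reqs[0]["release_stage"] == "accepted"
    assert reqs[0]["ext_ids"] == {"doi": "10.1234/abcd", "pmh_id": "oai:repo.example.edu:1"}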