From 716483103dd7fdfe7aab2982c51abae6d3f4271b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 25 Sep 2019 18:01:29 -0700 Subject: move a bunch of random old scripts to subdir --- python/deliver_dumpgrobid_to_s3.py | 124 ------------------- python/deliver_gwb_to_disk.py | 166 -------------------------- python/deliver_gwb_to_s3.py | 184 ----------------------------- python/enrich_scored_matches.py | 45 ------- python/filter_grobid_metadata.py | 159 ------------------------- python/filter_groupworks.py | 144 ---------------------- python/filter_scored_matches.py | 116 ------------------ python/import_grobid_metadata.py | 94 --------------- python/manifest_converter.py | 56 --------- python/scripts/deliver_dumpgrobid_to_s3.py | 124 +++++++++++++++++++ python/scripts/deliver_gwb_to_disk.py | 166 ++++++++++++++++++++++++++ python/scripts/deliver_gwb_to_s3.py | 184 +++++++++++++++++++++++++++++ python/scripts/enrich_scored_matches.py | 45 +++++++ python/scripts/filter_grobid_metadata.py | 159 +++++++++++++++++++++++++ python/scripts/filter_groupworks.py | 144 ++++++++++++++++++++++ python/scripts/filter_scored_matches.py | 116 ++++++++++++++++++ python/scripts/import_grobid_metadata.py | 94 +++++++++++++++ python/scripts/manifest_converter.py | 56 +++++++++ 18 files changed, 1088 insertions(+), 1088 deletions(-) delete mode 100755 python/deliver_dumpgrobid_to_s3.py delete mode 100755 python/deliver_gwb_to_disk.py delete mode 100755 python/deliver_gwb_to_s3.py delete mode 100755 python/enrich_scored_matches.py delete mode 100755 python/filter_grobid_metadata.py delete mode 100755 python/filter_groupworks.py delete mode 100755 python/filter_scored_matches.py delete mode 100755 python/import_grobid_metadata.py delete mode 100755 python/manifest_converter.py create mode 100755 python/scripts/deliver_dumpgrobid_to_s3.py create mode 100755 python/scripts/deliver_gwb_to_disk.py create mode 100755 python/scripts/deliver_gwb_to_s3.py create mode 100755 python/scripts/enrich_scored_matches.py create mode 100755 python/scripts/filter_grobid_metadata.py create mode 100755 python/scripts/filter_groupworks.py create mode 100755 python/scripts/filter_scored_matches.py create mode 100755 python/scripts/import_grobid_metadata.py create mode 100755 python/scripts/manifest_converter.py diff --git a/python/deliver_dumpgrobid_to_s3.py b/python/deliver_dumpgrobid_to_s3.py deleted file mode 100755 index ac0949c..0000000 --- a/python/deliver_dumpgrobid_to_s3.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python3 -""" -Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump -(from HBase) to AWS S3. - -See unpaywall delivery README (in bnewbold's scratch repo) for notes on running -this script for that specific use-case. - -Script takes: -- input TSV: `sha1_hex, json (including grobid0:tei_xml)` - => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered - down (eg, by join by SHA-1) to a specific manifest -- AWS S3 bucket and prefix - -AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) - -Output: -- errors/stats to stderr -- log to stdout (redirect to file), prefixed by sha1 - -Requires: -- raven (sentry) -- boto3 (AWS S3 client library) -""" - -import os -import sys -import json -import base64 -import hashlib -import argparse -from collections import Counter - -import boto3 -import raven - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() - - -def b32_hex(s): - """copy/pasta from elsewhere""" - s = s.strip().split()[0].lower() - if s.startswith("sha1:"): - s = s[5:] - if len(s) != 32: - return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - - -class DeliverDumpGrobidS3(): - - def __init__(self, s3_bucket, **kwargs): - self.rstore = None - self.count = Counter() - self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') - self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') - self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') - self.s3 = boto3.resource('s3') - self.bucket = self.s3.Bucket(self.s3_bucket) - - def run(self, dump_file): - sys.stderr.write("Starting...\n") - for line in dump_file: - line = line.strip().split('\t') - if len(line) != 2: - self.count['skip-line'] += 1 - continue - sha1_hex, grobid_json = line[0], line[1] - if len(sha1_hex) != 40: - sha1_hex = b32_hex(sha1_hex) - assert len(sha1_hex) == 40 - grobid = json.loads(grobid_json) - tei_xml = grobid.get('tei_xml') - if not tei_xml: - print("{}\tskip empty".format(sha1_hex)) - self.count['skip-empty'] += 1 - continue - tei_xml = tei_xml.encode('utf-8') - # upload to AWS S3 - obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix, - StorageClass=self.s3_storage_class), - Body=tei_xml) - print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) - self.count['success-s3'] += 1 - sys.stderr.write("{}\n".format(self.count)) - -@sentry_client.capture_exceptions -def main(): - - parser = argparse.ArgumentParser() - parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="grobid/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - type=str, - default=".tei.xml", - help='file suffix for created objects') - parser.add_argument('--s3-storage-class', - type=str, - default="STANDARD", - help='AWS S3 storage class (redundancy) to use') - parser.add_argument('dump_file', - help="TSV/JSON dump file", - default=sys.stdin, - type=argparse.FileType('r')) - args = parser.parse_args() - - worker = DeliverDumpGrobidS3(**args.__dict__) - worker.run(args.dump_file) - -if __name__ == '__main__': # pragma: no cover - main() diff --git a/python/deliver_gwb_to_disk.py b/python/deliver_gwb_to_disk.py deleted file mode 100755 index 3dcf962..0000000 --- a/python/deliver_gwb_to_disk.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -""" -Tool for bulk copying of PDFs (or other files) from GWB to local disk. -""" - -# XXX: some broken MRO thing going on in here due to python3 object wrangling -# in `wayback` library. Means we can't run pylint. -# pylint: skip-file - -import os -import sys -import json -import base64 -import hashlib -import argparse -from collections import Counter - -import raven -import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore -from gwb.loader import CDXLoaderFactory - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() - - -class DeliverGwbDisk: - - def __init__(self, disk_dir, **kwargs): - self.warc_uri_prefix = kwargs.get('warc_uri_prefix') - self.rstore = None - self.count = Counter() - # /serve/ instead of /download/ doesn't record view count - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') - # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) - self.disk_dir = disk_dir - self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') - self.disk_suffix = kwargs.get('disk_suffix', '.pdf') - - def fetch_warc_content(self, warc_path, offset, c_size): - warc_uri = self.warc_uri_prefix + warc_path - if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) - try: - gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) - except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") - except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) - except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) - except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) - # Note: could consider a generic "except Exception" here, as we get so - # many petabox errors. Do want jobs to fail loud and clear when the - # whole cluster is down though. - - if gwb_record.get_status()[0] != 200: - return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) - - try: - raw_content = gwb_record.open_raw_content().read() - except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) - return raw_content, None - - def run(self, manifest_file): - sys.stderr.write("Ensuring all 65536 base directories exist...\n") - for i in range(256): - for j in range(256): - fpath = "{}/{}{:02x}/{:02x}".format( - self.disk_dir, - self.disk_prefix, - i, - j) - os.makedirs(fpath, exist_ok=True) - sys.stderr.write("Starting...\n") - for line in manifest_file: - self.count['total'] += 1 - line = line.strip().split('\t') - if len(line) != 2: - self.count['skip-line'] += 1 - continue - sha1_hex, cdx_json = line[0], line[1] - assert len(sha1_hex) == 40 - file_cdx = json.loads(cdx_json) - # If warc is not item/file.(w)arc.gz form, skip it - if len(file_cdx['warc'].split('/')) != 2: - sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) - print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) - self.count['skip-warc'] += 1 - continue - # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) - if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) - self.count['err-petabox-fetch'] += 1 - continue - elif not blob: - print("{}\tskip-empty-blob".format(sha1_hex)) - self.count['skip-empty-blob'] += 1 - continue - # verify sha1 - if sha1_hex != hashlib.sha1(blob).hexdigest(): - #assert sha1_hex == hashlib.sha1(blob).hexdigest() - #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) - print("{}\terror petabox-hash-mismatch".format(sha1_hex)) - self.count['err-petabox-hash-mismatch'] += 1 - - self.count['petabox-ok'] += 1 - # save to disk - fpath = "{}/{}{}/{}/{}{}".format( - self.disk_dir, - self.disk_prefix, - sha1_hex[0:2], - sha1_hex[2:4], - sha1_hex, - self.disk_suffix) - with open(fpath, 'wb') as f: - f.write(blob) - print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) - self.count['success-disk'] += 1 - sys.stderr.write("{}\n".format(self.count)) - -@sentry_client.capture_exceptions -def main(): - - parser = argparse.ArgumentParser() - parser.add_argument('--disk-dir', - required=True, - type=str, - help='local base directory to save into') - parser.add_argument('--disk-prefix', - type=str, - default="pdf/", - help='directory prefix for items created in bucket') - parser.add_argument('--disk-suffix', - type=str, - default=".pdf", - help='file suffix for created files') - parser.add_argument('--warc-uri-prefix', - type=str, - default='https://archive.org/serve/', - help='URI where WARCs can be found') - parser.add_argument('manifest_file', - help="TSV/JSON manifest file", - default=sys.stdin, - type=argparse.FileType('r')) - args = parser.parse_args() - - worker = DeliverGwbDisk(**args.__dict__) - worker.run(args.manifest_file) - -if __name__ == '__main__': # pragma: no cover - main() diff --git a/python/deliver_gwb_to_s3.py b/python/deliver_gwb_to_s3.py deleted file mode 100755 index 39ac000..0000000 --- a/python/deliver_gwb_to_s3.py +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env python3 -""" -Tool for bulk copying of PDFs (or other files) from GWB to AWS S3. - -See unpaywall delivery README (in bnewbold's scratch repo) for notes on running -this script for that specific use-case. - -Script takes: -- input TSV: `sha1_hex, file:cdx (json)` - => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest -- AWS S3 bucket and prefix - -AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) - -GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/. - -20x threads on a single machine can process about 340k files in 3 hours; that's -roughly 6 hours per million per host with 32 threads, or 5k files an hour -(1.6/second) per thread. Two large machines should be able to upload 10 million -files in about 30 hours. - -Output: -- errors/stats to stderr -- log to stdout (redirect to file), prefixed by sha1 - -Requires: -- raven (sentry) -- boto3 (AWS S3 client library) -- wayback/GWB libraries -""" - -# XXX: some broken MRO thing going on in here due to python3 object wrangling -# in `wayback` library. Means we can't run pylint. -# pylint: skip-file - -import os -import sys -import json -import base64 -import hashlib -import argparse -from collections import Counter - -import boto3 -import raven -import wayback.exception -from http.client import IncompleteRead -from wayback.resourcestore import ResourceStore -from gwb.loader import CDXLoaderFactory - -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = raven.Client() - - -class DeliverGwbS3: - - def __init__(self, s3_bucket, **kwargs): - self.warc_uri_prefix = kwargs.get('warc_uri_prefix') - self.rstore = None - self.count = Counter() - # /serve/ instead of /download/ doesn't record view count - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') - # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) - self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') - self.s3_suffix = kwargs.get('s3_suffix', '.pdf') - self.s3 = boto3.resource('s3') - self.bucket = self.s3.Bucket(self.s3_bucket) - - def fetch_warc_content(self, warc_path, offset, c_size): - warc_uri = self.warc_uri_prefix + warc_path - if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) - try: - gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) - except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") - except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) - except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) - except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) - # Note: could consider a generic "except Exception" here, as we get so - # many petabox errors. Do want jobs to fail loud and clear when the - # whole cluster is down though. - - if gwb_record.get_status()[0] != 200: - return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) - - try: - raw_content = gwb_record.open_raw_content().read() - except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) - return raw_content, None - - def run(self, manifest_file): - sys.stderr.write("Starting...\n") - for line in manifest_file: - self.count['total'] += 1 - line = line.strip().split('\t') - if len(line) != 2: - self.count['skip-line'] += 1 - continue - sha1_hex, cdx_json = line[0], line[1] - assert len(sha1_hex) == 40 - file_cdx = json.loads(cdx_json) - # If warc is not item/file.(w)arc.gz form, skip it - if len(file_cdx['warc'].split('/')) != 2: - sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) - print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) - self.count['skip-warc'] += 1 - continue - # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) - if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) - self.count['err-petabox-fetch'] += 1 - continue - elif not blob: - print("{}\tskip-empty-blob".format(sha1_hex)) - self.count['skip-empty-blob'] += 1 - continue - # verify sha1 - if sha1_hex != hashlib.sha1(blob).hexdigest(): - #assert sha1_hex == hashlib.sha1(blob).hexdigest() - #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) - print("{}\terror petabox-hash-mismatch".format(sha1_hex)) - self.count['err-petabox-hash-mismatch'] += 1 - - self.count['petabox-ok'] += 1 - # upload to AWS S3 - obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), - Body=blob) - print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) - self.count['success-s3'] += 1 - sys.stderr.write("{}\n".format(self.count)) - -@sentry_client.capture_exceptions -def main(): - - parser = argparse.ArgumentParser() - parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="pdf/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - type=str, - default=".pdf", - help='file suffix for created objects') - parser.add_argument('--warc-uri-prefix', - type=str, - default='https://archive.org/serve/', - help='URI where WARCs can be found') - parser.add_argument('manifest_file', - help="TSV/JSON manifest file", - default=sys.stdin, - type=argparse.FileType('r')) - args = parser.parse_args() - - worker = DeliverGwbS3(**args.__dict__) - worker.run(args.manifest_file) - -if __name__ == '__main__': # pragma: no cover - main() diff --git a/python/enrich_scored_matches.py b/python/enrich_scored_matches.py deleted file mode 100755 index 9fe1499..0000000 --- a/python/enrich_scored_matches.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -""" -Takes an "joined" TSV input stream: - -- sha1 -- dois (JSON list) -- cdx (JSON object) - - url - - dt - (etc) -- mimetype -- size (integer) - -And outputs JSON objects that are can be imported into fatcat with the -"matched" script. - -No dependencies (only python3 stdlib) -""" - -import sys -import json -import base64 - -def run(): - for line in sys.stdin: - line = line.split('\t') - assert len(line) == 5 - raw_sha1 = line[0].replace('sha1:', '') - dois = json.loads(line[1]) - cdx = json.loads(line[2]) - mimetype = line[3] - size = int(line[4]) - - sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() - - obj = dict( - sha1=sha1, - dois=dois, - cdx=[dict(url=cdx['url'], dt=cdx['dt'])], - size=size, - mimetype=mimetype) - print(json.dumps(obj)) - -if __name__=='__main__': - run() diff --git a/python/filter_grobid_metadata.py b/python/filter_grobid_metadata.py deleted file mode 100755 index c33ab86..0000000 --- a/python/filter_grobid_metadata.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json - -with open('title_slug_blacklist.txt', 'r') as f: - TITLE_BLACKLIST = [l.strip() for l in f] - -TITLE_BLACKLIST.extend(( - 'editorial', - 'advertisement', - 'bookreviews', - 'reviews', - 'nr', - 'abstractoriginalarticle', - 'originalarticle', - 'impactfactor', - 'articlenumber', -)) - -# The full name can't *entirely* be one of these -NAME_BLACKLIST = ( - 'phd', - 'phdstudent', -) - -def tokenize(s, remove_whitespace=True): - - s.replace(''', "'") - # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) - - if remove_whitespace: - s = ''.join(s.split()) - - # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').decode('utf8').replace('?', '') - -assert tokenize("Impact Factor: 2.114") == "impactfactor" -assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST - -def filter_title(title): - - title = title.strip() - if len(title) > 500: - return None - title_slug = tokenize(title, remove_whitespace=True) - if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST: - return None - if title_slug.startswith('nr'): - return None - if title.lower().replace('.', '').startswith('int j '): - return None - - for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "): - if title.startswith(prefix): - title.replace(prefix, '') - - if title.startswith("The Journal of "): - return None - - if "volume" in title_slug and "issue" in title_slug: - return None - - if "downloadedfrom" in title_slug: - return None - - if title_slug.startswith("issn"): - return None - - # titles with too many or too few words in title - title_words = len(title.split()) - if title_words > 50 or title_words < 2: - return None - - # titles with spaces between every letter (more than N such single-char words) - if len([True for w in title.split() if len(w) == 1]) > 12: - return None - - # too deep subtitling/splitting - if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: - return None - - return title - -def filter_author_name(name): - name = name['name'] - if name.strip().lower().replace(' ', '') in NAME_BLACKLIST: - return None - return ' '.join([t for t in name.split() if tokenize(t)]) - -def filter_authors(l): - return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] - -def filter_refs(l): - # TODO: - return l - -def filter_journal_name(name): - # same blacklist, for now - if not name: - return None - name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') - slug_name = tokenize(name) - if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º": - return None - for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): - if name.startswith(prefix): - name = name.replace(prefix, '') - for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): - if name.endswith(suffix): - name = name.replace(suffix, '') - if "====================" in name: - return None - if len(name) > 150: - return None - return ' '.join(name.split()) - -def filter_metadata(obj): - if not (obj.get('title') and obj.get('authors')): - return None - - title = filter_title(obj['title']) - if not title: - #sys.stderr.write("bad title\n") - return None - else: - obj['title'] = title - obj['authors'] = filter_authors(obj['authors']) - obj['citations'] = filter_refs(obj['citations']) - obj['journal']['name'] = filter_journal_name(obj['journal']['name']) - - return obj - -def run(invert=False): - for line in sys.stdin: - fields = line.split('\t') - if len(fields) == 5: - raw = fields[4] - elif len(fields) == 1: - raw = fields[0] - else: - sys.stderr.write("bad line\n") - continue - obj = json.loads(raw) - processed = filter_metadata(obj) - if processed: - if not invert: - processed = json.dumps(processed) - if len(fields) == 5: - fields[4] = processed - else: - fields[0] = processed - print('\t'.join(fields)) - elif invert: - print(raw.strip()) - -if __name__=="__main__": - run(invert="--invert" in sys.argv) diff --git a/python/filter_groupworks.py b/python/filter_groupworks.py deleted file mode 100755 index bbba770..0000000 --- a/python/filter_groupworks.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -""" -Filters an input stream of sorted "groupworks" scalding job, and outputs -"good enough" matches to be merged in fatcat. - -Output is JSON lines which are arrays of releases that could/should be merged -together, either as multiple releases under a single work, or releases merged -into a single entity (via redirects). - -Note that releases *should* only end up on a single line, and only once per -line! - -No dependencies (only python3 stdlib) - -Note: the actual importer/merger should filter the following patterns out: -- container title has "letter" and "diar" -- contribs (authors) contain "&NA;" -- dates differ (not just year) -""" - -import sys -import json - -# out of 1000 -SCORE_THRESHOLD = 900 - -MAX_SLUG_LINES = 50 - -REQUIRE_AUTHORS = False - -def tokenize(s, remove_whitespace=False): - - s.replace(''', "'") - # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) - - if remove_whitespace: - s = ''.join(s.split()) - - # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').replace(b'?', b'') - -def check_authors(left, right): - """ - Intended to check GROBID extracted authors (right) against "known good" - (but maybe not perfect) Crossref metadata authors ("left"). - """ - if not left and not right: - return bool(not REQUIRE_AUTHORS) - if len(left) != len(right): - return False - right_all = tokenize(" ".join(right)) - for i in range(len(left)): - l = left[i].lower().replace('jr.', '').split() - if not l: - return False - l = tokenize(l[-1]) - if len(l) <= 1: - # weird author name (single char) - return False - if l not in right_all: - #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) - return False - return True - -def test_check_authors(): - assert check_authors([], []) == bool(not REQUIRE_AUTHORS) - assert not check_authors([], ['one']) - assert check_authors(['one'], ['one']) - assert check_authors(['one two'], ['One Two']) - assert check_authors(['two'], ['One Two']) - assert check_authors(['two'], ['two, one']) - assert check_authors(['mago'], ['Mr. Magoo']) - assert check_authors(['Mr. Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) - -# Rows are (score, left, right) -def process_group(rows): - - # first pass reduces size of list and generates a linkage graph - filtered = list() - for row in rows: - score = int(row[0]) - if score < SCORE_THRESHOLD: - continue - left = json.loads(row[1]) - right = json.loads(row[2]) - # authors must roughly match - if not check_authors(left['authors'], right['authors']): - continue - # years must match (if defined) - if left['year'] and right['year'] and left['year'] != right['year']: - continue - filtered.append((left, right)) - - if not filtered: - return - - # second pass finds a connected graph and returns that - releases = dict() - group_ids = set() - for row in filtered[1:]: - (left, right) = row - l_id = left['fatcat_release'] - r_id = right['fatcat_release'] - releases[l_id] = left - releases[r_id] = right - if not group_ids: - group_ids.add(l_id) - group_ids.add(r_id) - continue - if l_id in group_ids or r_id in group_ids: - group_ids.add(l_id) - group_ids.add(r_id) - continue - - if not group_ids: - return - - print(json.dumps([releases[ident] for ident in group_ids])) - -def run(): - - last_slug = None - lines = [] - - # group lines by slug, and process in batches - for line in sys.stdin: - line = line.strip().split('\t') - assert len(line) == 4 - slug = line[0] - if last_slug and slug != last_slug and lines: - if len(lines) <= MAX_SLUG_LINES: - process_group(lines) - lines = [] - last_slug = slug - lines.append(line[1:]) - - # catch any remaining - if lines: - process_group(lines) - -if __name__=='__main__': - run() diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py deleted file mode 100755 index 3654b87..0000000 --- a/python/filter_scored_matches.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -""" -Filters an input stream of sorted "matchcrossref" scalding job, and outputs -"good enough" matches to be inserted to fatcat. - -Currently works on DOI numbers. Filters for a high enough string match (doesn't -re-do title match), and checks author lists. Filters out slugs with too many -matches, and outputs one-line-per-sha1 (aka, file). - -No dependencies (only python3 stdlib) -""" - -import sys -import json - -# out of 1000 -score_threshold = 900 - -max_slug_lines = 10 - -require_authors = 1 - - -def tokenize(s, remove_whitespace=False): - - s.replace(''', "'") - # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) - - if remove_whitespace: - s = ''.join(s.split()) - - # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').replace(b'?', b'') - -def check_authors(left, right): - """ - Intended to check GROBID extracted authors (right) against "known good" - (but maybe not perfect) Crossref metadata authors ("left"). - """ - if not left: - return False - if len(left) > len(right): - return False - right_all = tokenize(" ".join(right)) - for i in range(len(left)): - l = left[i].lower().replace('jr.', '').split() - if not l: - return False - l = tokenize(l[-1]) - if len(l) <= 1: - # weird author name (single char) - return False - if l not in right_all: - #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) - return False - return True - -def test_check_authors(): - assert not check_authors([], []) - assert not check_authors([], ['one']) - assert check_authors(['one'], ['one']) - assert check_authors(['one two'], ['One Two']) - assert check_authors(['two'], ['One Two']) - assert check_authors(['two'], ['two, one']) - assert check_authors(['mago'], ['Mr. Magoo']) - assert check_authors(['Mr. Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) - -# Rows are (score, grobid, crossref) -def process_group(rows): - if len(rows) > max_slug_lines: - return - keepers = dict() - for row in rows: - score = int(row[0]) - if score < score_threshold: - continue - grobid = json.loads(row[1]) - crossref = json.loads(row[2]) - if not check_authors(crossref['authors'], grobid['authors']): - #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) - continue - else: - #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) - pass - sha1 = grobid['sha1'] - doi = crossref['doi'].lower() - l = keepers.get(sha1, list()) - l.append(doi) - keepers[sha1] = l - for sha1, doi_list in keepers.items(): - print("{}\t{}".format(sha1, json.dumps(doi_list))) - -def run(): - - last_slug = None - lines = [] - - # group lines by slug, and process in batches - for line in sys.stdin: - line = line.strip().split('\t') - assert len(line) == 4 - slug = line[0] - if last_slug and slug != last_slug and lines: - process_group(lines) - lines = [] - last_slug = slug - lines.append(line[1:]) - - # catch any remaining - if lines: - process_group(lines) - -if __name__=='__main__': - run() diff --git a/python/import_grobid_metadata.py b/python/import_grobid_metadata.py deleted file mode 100755 index 3d2e14c..0000000 --- a/python/import_grobid_metadata.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json -import datetime - -MAX_ABSTRACT_BYTES=4096 - -def parse_grobid_json(obj): - - if not obj.get('title'): - return None - - extra = dict() - - if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: - abobj = dict( - mimetype="text/plain", - language=None, - content=obj.get('abstract').strip()) - abstracts = [abobj] - else: - abstracts = None - - contribs = [] - for a in obj.get('authors', []): - c = dict(raw_name=a, role="author") - contribs.append(c) - - refs = [] - for raw in obj.get('citations', []): - extra = dict() - ref = dict() - ref['key'] = raw.get('id') - if raw.get('title'): - ref['title'] = raw['title'].strip() - if raw.get('date'): - try: - year = int(raw['date'].strip()[:4]) - ref['year'] = year - except: - pass - for key in ('volume', 'url', 'issue', 'publisher'): - if raw.get(key): - extra[key] = raw[key].strip() - if raw.get('authors'): - extra['authors'] = [a['name'] for a in raw['authors']] - if extra: - extra = dict(grobid=extra) - else: - extra = None - ref['extra'] = extra - refs.append(ref) - - release_type = "journal-article" - release_date = None - if obj.get('date'): - # TODO: only returns year, ever? how to handle? - release_date = datetime.datetime(year=obj['date'], month=1, day=1) - - if obj.get('doi'): - extra['doi'] = obj['doi'] - if obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True - - # TODO: ISSN/eISSN handling? or just journal name lookup? - - if extra: - extra = dict(grobid=extra) - else: - extra = None - - return dict( - title=obj['title'].strip(), - contribs=contribs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), - abstracts=abstracts, - release_type=release_type, - release_date=release_date, - extra=extra) - -def run(): - for line in sys.stdin: - obj = json.loads(line) - out = parse_grobid_json(obj) - if out: - print(out) - -if __name__=="__main__": - run() diff --git a/python/manifest_converter.py b/python/manifest_converter.py deleted file mode 100755 index 35cee5b..0000000 --- a/python/manifest_converter.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python3 -""" -Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of -"match" JSON objects which can be imported into fatcat with matched_import.py - -This was used to convert this manifest: - - https://archive.org/details/ia_papers_manifest_2018-01-25/ - -to JSON format for fast fatcat importing. -""" - -import sys -import json -import sqlite3 - -# iterate over rows in files metadata... -# 1. select all identified DOIs -# => filter based on count -# 2. select all file metadata -# 3. output object - -def or_none(s): - if s is None: - return None - elif type(s) == str and ((not s) or s == "\\N" or s == "-"): - return None - return s - -def process_db(db_path): - - db = sqlite3.connect(db_path) - - for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"): - sha1 = row[0] - dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall() - dois = [d[0] for d in dois] - if not dois: - continue - urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall() - if not urls: - continue - cdx = [dict(url=row[0], dt=row[1]) for row in urls] - obj = dict( - sha1=sha1, - mimetype=or_none(row[1]), - size=(or_none(row[2]) and int(row[2])), - md5=or_none(row[3]), - dois=dois, - cdx=cdx, - ) - dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) - print(json.dumps(obj)) - -if __name__=="__main__": - process_db(sys.argv[1]) diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py new file mode 100755 index 0000000..ac0949c --- /dev/null +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Tool for bulk uploading GROBID TEI-XML output from a local filesystem dump +(from HBase) to AWS S3. + +See unpaywall delivery README (in bnewbold's scratch repo) for notes on running +this script for that specific use-case. + +Script takes: +- input TSV: `sha1_hex, json (including grobid0:tei_xml)` + => usually from dumpgrobid, with SHA-1 key transformed to hex, and filtered + down (eg, by join by SHA-1) to a specific manifest +- AWS S3 bucket and prefix + +AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + +Output: +- errors/stats to stderr +- log to stdout (redirect to file), prefixed by sha1 + +Requires: +- raven (sentry) +- boto3 (AWS S3 client library) +""" + +import os +import sys +import json +import base64 +import hashlib +import argparse +from collections import Counter + +import boto3 +import raven + +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + + +def b32_hex(s): + """copy/pasta from elsewhere""" + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + + +class DeliverDumpGrobidS3(): + + def __init__(self, s3_bucket, **kwargs): + self.rstore = None + self.count = Counter() + self.s3_bucket = s3_bucket + self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') + self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') + self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') + self.s3 = boto3.resource('s3') + self.bucket = self.s3.Bucket(self.s3_bucket) + + def run(self, dump_file): + sys.stderr.write("Starting...\n") + for line in dump_file: + line = line.strip().split('\t') + if len(line) != 2: + self.count['skip-line'] += 1 + continue + sha1_hex, grobid_json = line[0], line[1] + if len(sha1_hex) != 40: + sha1_hex = b32_hex(sha1_hex) + assert len(sha1_hex) == 40 + grobid = json.loads(grobid_json) + tei_xml = grobid.get('tei_xml') + if not tei_xml: + print("{}\tskip empty".format(sha1_hex)) + self.count['skip-empty'] += 1 + continue + tei_xml = tei_xml.encode('utf-8') + # upload to AWS S3 + obj = self.bucket.put_object( + Key="{}{}/{}{}".format( + self.s3_prefix, + sha1_hex[0:4], + sha1_hex, + self.s3_suffix, + StorageClass=self.s3_storage_class), + Body=tei_xml) + print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) + self.count['success-s3'] += 1 + sys.stderr.write("{}\n".format(self.count)) + +@sentry_client.capture_exceptions +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--s3-bucket', + required=True, + type=str, + help='AWS S3 bucket to upload into') + parser.add_argument('--s3-prefix', + type=str, + default="grobid/", + help='key prefix for items created in bucket') + parser.add_argument('--s3-suffix', + type=str, + default=".tei.xml", + help='file suffix for created objects') + parser.add_argument('--s3-storage-class', + type=str, + default="STANDARD", + help='AWS S3 storage class (redundancy) to use') + parser.add_argument('dump_file', + help="TSV/JSON dump file", + default=sys.stdin, + type=argparse.FileType('r')) + args = parser.parse_args() + + worker = DeliverDumpGrobidS3(**args.__dict__) + worker.run(args.dump_file) + +if __name__ == '__main__': # pragma: no cover + main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py new file mode 100755 index 0000000..3dcf962 --- /dev/null +++ b/python/scripts/deliver_gwb_to_disk.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +""" +Tool for bulk copying of PDFs (or other files) from GWB to local disk. +""" + +# XXX: some broken MRO thing going on in here due to python3 object wrangling +# in `wayback` library. Means we can't run pylint. +# pylint: skip-file + +import os +import sys +import json +import base64 +import hashlib +import argparse +from collections import Counter + +import raven +import wayback.exception +from http.client import IncompleteRead +from wayback.resourcestore import ResourceStore +from gwb.loader import CDXLoaderFactory + +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + + +class DeliverGwbDisk: + + def __init__(self, disk_dir, **kwargs): + self.warc_uri_prefix = kwargs.get('warc_uri_prefix') + self.rstore = None + self.count = Counter() + # /serve/ instead of /download/ doesn't record view count + self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + # gwb library will fall back to reading from /opt/.petabox/webdata.secret + self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.disk_dir = disk_dir + self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') + self.disk_suffix = kwargs.get('disk_suffix', '.pdf') + + def fetch_warc_content(self, warc_path, offset, c_size): + warc_uri = self.warc_uri_prefix + warc_path + if not self.rstore: + self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( + webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url)) + try: + gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) + except wayback.exception.ResourceUnavailable: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + except ValueError as ve: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + except EOFError as eofe: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + except TypeError as te: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + # Note: could consider a generic "except Exception" here, as we get so + # many petabox errors. Do want jobs to fail loud and clear when the + # whole cluster is down though. + + if gwb_record.get_status()[0] != 200: + return None, dict(status="error", + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0]) + + try: + raw_content = gwb_record.open_raw_content().read() + except IncompleteRead as ire: + return None, dict(status="error", + reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return raw_content, None + + def run(self, manifest_file): + sys.stderr.write("Ensuring all 65536 base directories exist...\n") + for i in range(256): + for j in range(256): + fpath = "{}/{}{:02x}/{:02x}".format( + self.disk_dir, + self.disk_prefix, + i, + j) + os.makedirs(fpath, exist_ok=True) + sys.stderr.write("Starting...\n") + for line in manifest_file: + self.count['total'] += 1 + line = line.strip().split('\t') + if len(line) != 2: + self.count['skip-line'] += 1 + continue + sha1_hex, cdx_json = line[0], line[1] + assert len(sha1_hex) == 40 + file_cdx = json.loads(cdx_json) + # If warc is not item/file.(w)arc.gz form, skip it + if len(file_cdx['warc'].split('/')) != 2: + sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) + print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) + self.count['skip-warc'] += 1 + continue + # fetch from GWB/petabox via HTTP range-request + blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + if blob is None and status: + print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) + self.count['err-petabox-fetch'] += 1 + continue + elif not blob: + print("{}\tskip-empty-blob".format(sha1_hex)) + self.count['skip-empty-blob'] += 1 + continue + # verify sha1 + if sha1_hex != hashlib.sha1(blob).hexdigest(): + #assert sha1_hex == hashlib.sha1(blob).hexdigest() + #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) + print("{}\terror petabox-hash-mismatch".format(sha1_hex)) + self.count['err-petabox-hash-mismatch'] += 1 + + self.count['petabox-ok'] += 1 + # save to disk + fpath = "{}/{}{}/{}/{}{}".format( + self.disk_dir, + self.disk_prefix, + sha1_hex[0:2], + sha1_hex[2:4], + sha1_hex, + self.disk_suffix) + with open(fpath, 'wb') as f: + f.write(blob) + print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) + self.count['success-disk'] += 1 + sys.stderr.write("{}\n".format(self.count)) + +@sentry_client.capture_exceptions +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--disk-dir', + required=True, + type=str, + help='local base directory to save into') + parser.add_argument('--disk-prefix', + type=str, + default="pdf/", + help='directory prefix for items created in bucket') + parser.add_argument('--disk-suffix', + type=str, + default=".pdf", + help='file suffix for created files') + parser.add_argument('--warc-uri-prefix', + type=str, + default='https://archive.org/serve/', + help='URI where WARCs can be found') + parser.add_argument('manifest_file', + help="TSV/JSON manifest file", + default=sys.stdin, + type=argparse.FileType('r')) + args = parser.parse_args() + + worker = DeliverGwbDisk(**args.__dict__) + worker.run(args.manifest_file) + +if __name__ == '__main__': # pragma: no cover + main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py new file mode 100755 index 0000000..39ac000 --- /dev/null +++ b/python/scripts/deliver_gwb_to_s3.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +""" +Tool for bulk copying of PDFs (or other files) from GWB to AWS S3. + +See unpaywall delivery README (in bnewbold's scratch repo) for notes on running +this script for that specific use-case. + +Script takes: +- input TSV: `sha1_hex, file:cdx (json)` + => usually from dumpfilemeta, filtered down (eg, by join by SHA-1) to a specific manifest +- AWS S3 bucket and prefix + +AWS S3 credentials are passed via environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + +GWB credentials from environment variable PETABOX_WEBDATA_SECRET, else looks in /opt/.petabox/. + +20x threads on a single machine can process about 340k files in 3 hours; that's +roughly 6 hours per million per host with 32 threads, or 5k files an hour +(1.6/second) per thread. Two large machines should be able to upload 10 million +files in about 30 hours. + +Output: +- errors/stats to stderr +- log to stdout (redirect to file), prefixed by sha1 + +Requires: +- raven (sentry) +- boto3 (AWS S3 client library) +- wayback/GWB libraries +""" + +# XXX: some broken MRO thing going on in here due to python3 object wrangling +# in `wayback` library. Means we can't run pylint. +# pylint: skip-file + +import os +import sys +import json +import base64 +import hashlib +import argparse +from collections import Counter + +import boto3 +import raven +import wayback.exception +from http.client import IncompleteRead +from wayback.resourcestore import ResourceStore +from gwb.loader import CDXLoaderFactory + +# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable +sentry_client = raven.Client() + + +class DeliverGwbS3: + + def __init__(self, s3_bucket, **kwargs): + self.warc_uri_prefix = kwargs.get('warc_uri_prefix') + self.rstore = None + self.count = Counter() + # /serve/ instead of /download/ doesn't record view count + self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + # gwb library will fall back to reading from /opt/.petabox/webdata.secret + self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.s3_bucket = s3_bucket + self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') + self.s3_suffix = kwargs.get('s3_suffix', '.pdf') + self.s3 = boto3.resource('s3') + self.bucket = self.s3.Bucket(self.s3_bucket) + + def fetch_warc_content(self, warc_path, offset, c_size): + warc_uri = self.warc_uri_prefix + warc_path + if not self.rstore: + self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( + webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url)) + try: + gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) + except wayback.exception.ResourceUnavailable: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + except ValueError as ve: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + except EOFError as eofe: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + except TypeError as te: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + # Note: could consider a generic "except Exception" here, as we get so + # many petabox errors. Do want jobs to fail loud and clear when the + # whole cluster is down though. + + if gwb_record.get_status()[0] != 200: + return None, dict(status="error", + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0]) + + try: + raw_content = gwb_record.open_raw_content().read() + except IncompleteRead as ire: + return None, dict(status="error", + reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return raw_content, None + + def run(self, manifest_file): + sys.stderr.write("Starting...\n") + for line in manifest_file: + self.count['total'] += 1 + line = line.strip().split('\t') + if len(line) != 2: + self.count['skip-line'] += 1 + continue + sha1_hex, cdx_json = line[0], line[1] + assert len(sha1_hex) == 40 + file_cdx = json.loads(cdx_json) + # If warc is not item/file.(w)arc.gz form, skip it + if len(file_cdx['warc'].split('/')) != 2: + sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) + print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) + self.count['skip-warc'] += 1 + continue + # fetch from GWB/petabox via HTTP range-request + blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + if blob is None and status: + print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) + self.count['err-petabox-fetch'] += 1 + continue + elif not blob: + print("{}\tskip-empty-blob".format(sha1_hex)) + self.count['skip-empty-blob'] += 1 + continue + # verify sha1 + if sha1_hex != hashlib.sha1(blob).hexdigest(): + #assert sha1_hex == hashlib.sha1(blob).hexdigest() + #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) + print("{}\terror petabox-hash-mismatch".format(sha1_hex)) + self.count['err-petabox-hash-mismatch'] += 1 + + self.count['petabox-ok'] += 1 + # upload to AWS S3 + obj = self.bucket.put_object( + Key="{}{}/{}{}".format( + self.s3_prefix, + sha1_hex[0:4], + sha1_hex, + self.s3_suffix), + Body=blob) + print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) + self.count['success-s3'] += 1 + sys.stderr.write("{}\n".format(self.count)) + +@sentry_client.capture_exceptions +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--s3-bucket', + required=True, + type=str, + help='AWS S3 bucket to upload into') + parser.add_argument('--s3-prefix', + type=str, + default="pdf/", + help='key prefix for items created in bucket') + parser.add_argument('--s3-suffix', + type=str, + default=".pdf", + help='file suffix for created objects') + parser.add_argument('--warc-uri-prefix', + type=str, + default='https://archive.org/serve/', + help='URI where WARCs can be found') + parser.add_argument('manifest_file', + help="TSV/JSON manifest file", + default=sys.stdin, + type=argparse.FileType('r')) + args = parser.parse_args() + + worker = DeliverGwbS3(**args.__dict__) + worker.run(args.manifest_file) + +if __name__ == '__main__': # pragma: no cover + main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py new file mode 100755 index 0000000..9fe1499 --- /dev/null +++ b/python/scripts/enrich_scored_matches.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +""" +Takes an "joined" TSV input stream: + +- sha1 +- dois (JSON list) +- cdx (JSON object) + - url + - dt + (etc) +- mimetype +- size (integer) + +And outputs JSON objects that are can be imported into fatcat with the +"matched" script. + +No dependencies (only python3 stdlib) +""" + +import sys +import json +import base64 + +def run(): + for line in sys.stdin: + line = line.split('\t') + assert len(line) == 5 + raw_sha1 = line[0].replace('sha1:', '') + dois = json.loads(line[1]) + cdx = json.loads(line[2]) + mimetype = line[3] + size = int(line[4]) + + sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() + + obj = dict( + sha1=sha1, + dois=dois, + cdx=[dict(url=cdx['url'], dt=cdx['dt'])], + size=size, + mimetype=mimetype) + print(json.dumps(obj)) + +if __name__=='__main__': + run() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py new file mode 100755 index 0000000..c33ab86 --- /dev/null +++ b/python/scripts/filter_grobid_metadata.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +import sys +import json + +with open('title_slug_blacklist.txt', 'r') as f: + TITLE_BLACKLIST = [l.strip() for l in f] + +TITLE_BLACKLIST.extend(( + 'editorial', + 'advertisement', + 'bookreviews', + 'reviews', + 'nr', + 'abstractoriginalarticle', + 'originalarticle', + 'impactfactor', + 'articlenumber', +)) + +# The full name can't *entirely* be one of these +NAME_BLACKLIST = ( + 'phd', + 'phdstudent', +) + +def tokenize(s, remove_whitespace=True): + + s.replace(''', "'") + # Remove non-alphanumeric characters + s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) + + if remove_whitespace: + s = ''.join(s.split()) + + # Encode as dumb ASCII (TODO: this is horrible) + return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + +assert tokenize("Impact Factor: 2.114") == "impactfactor" +assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST + +def filter_title(title): + + title = title.strip() + if len(title) > 500: + return None + title_slug = tokenize(title, remove_whitespace=True) + if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST: + return None + if title_slug.startswith('nr'): + return None + if title.lower().replace('.', '').startswith('int j '): + return None + + for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "): + if title.startswith(prefix): + title.replace(prefix, '') + + if title.startswith("The Journal of "): + return None + + if "volume" in title_slug and "issue" in title_slug: + return None + + if "downloadedfrom" in title_slug: + return None + + if title_slug.startswith("issn"): + return None + + # titles with too many or too few words in title + title_words = len(title.split()) + if title_words > 50 or title_words < 2: + return None + + # titles with spaces between every letter (more than N such single-char words) + if len([True for w in title.split() if len(w) == 1]) > 12: + return None + + # too deep subtitling/splitting + if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: + return None + + return title + +def filter_author_name(name): + name = name['name'] + if name.strip().lower().replace(' ', '') in NAME_BLACKLIST: + return None + return ' '.join([t for t in name.split() if tokenize(t)]) + +def filter_authors(l): + return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] + +def filter_refs(l): + # TODO: + return l + +def filter_journal_name(name): + # same blacklist, for now + if not name: + return None + name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') + slug_name = tokenize(name) + if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º": + return None + for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): + if name.startswith(prefix): + name = name.replace(prefix, '') + for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): + if name.endswith(suffix): + name = name.replace(suffix, '') + if "====================" in name: + return None + if len(name) > 150: + return None + return ' '.join(name.split()) + +def filter_metadata(obj): + if not (obj.get('title') and obj.get('authors')): + return None + + title = filter_title(obj['title']) + if not title: + #sys.stderr.write("bad title\n") + return None + else: + obj['title'] = title + obj['authors'] = filter_authors(obj['authors']) + obj['citations'] = filter_refs(obj['citations']) + obj['journal']['name'] = filter_journal_name(obj['journal']['name']) + + return obj + +def run(invert=False): + for line in sys.stdin: + fields = line.split('\t') + if len(fields) == 5: + raw = fields[4] + elif len(fields) == 1: + raw = fields[0] + else: + sys.stderr.write("bad line\n") + continue + obj = json.loads(raw) + processed = filter_metadata(obj) + if processed: + if not invert: + processed = json.dumps(processed) + if len(fields) == 5: + fields[4] = processed + else: + fields[0] = processed + print('\t'.join(fields)) + elif invert: + print(raw.strip()) + +if __name__=="__main__": + run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py new file mode 100755 index 0000000..bbba770 --- /dev/null +++ b/python/scripts/filter_groupworks.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Filters an input stream of sorted "groupworks" scalding job, and outputs +"good enough" matches to be merged in fatcat. + +Output is JSON lines which are arrays of releases that could/should be merged +together, either as multiple releases under a single work, or releases merged +into a single entity (via redirects). + +Note that releases *should* only end up on a single line, and only once per +line! + +No dependencies (only python3 stdlib) + +Note: the actual importer/merger should filter the following patterns out: +- container title has "letter" and "diar" +- contribs (authors) contain "&NA;" +- dates differ (not just year) +""" + +import sys +import json + +# out of 1000 +SCORE_THRESHOLD = 900 + +MAX_SLUG_LINES = 50 + +REQUIRE_AUTHORS = False + +def tokenize(s, remove_whitespace=False): + + s.replace(''', "'") + # Remove non-alphanumeric characters + s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + + if remove_whitespace: + s = ''.join(s.split()) + + # Encode as dumb ASCII (TODO: this is horrible) + return s.encode('ascii', 'replace').replace(b'?', b'') + +def check_authors(left, right): + """ + Intended to check GROBID extracted authors (right) against "known good" + (but maybe not perfect) Crossref metadata authors ("left"). + """ + if not left and not right: + return bool(not REQUIRE_AUTHORS) + if len(left) != len(right): + return False + right_all = tokenize(" ".join(right)) + for i in range(len(left)): + l = left[i].lower().replace('jr.', '').split() + if not l: + return False + l = tokenize(l[-1]) + if len(l) <= 1: + # weird author name (single char) + return False + if l not in right_all: + #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) + return False + return True + +def test_check_authors(): + assert check_authors([], []) == bool(not REQUIRE_AUTHORS) + assert not check_authors([], ['one']) + assert check_authors(['one'], ['one']) + assert check_authors(['one two'], ['One Two']) + assert check_authors(['two'], ['One Two']) + assert check_authors(['two'], ['two, one']) + assert check_authors(['mago'], ['Mr. Magoo']) + assert check_authors(['Mr. Magoo'], ['Mr Magoo']) + assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + +# Rows are (score, left, right) +def process_group(rows): + + # first pass reduces size of list and generates a linkage graph + filtered = list() + for row in rows: + score = int(row[0]) + if score < SCORE_THRESHOLD: + continue + left = json.loads(row[1]) + right = json.loads(row[2]) + # authors must roughly match + if not check_authors(left['authors'], right['authors']): + continue + # years must match (if defined) + if left['year'] and right['year'] and left['year'] != right['year']: + continue + filtered.append((left, right)) + + if not filtered: + return + + # second pass finds a connected graph and returns that + releases = dict() + group_ids = set() + for row in filtered[1:]: + (left, right) = row + l_id = left['fatcat_release'] + r_id = right['fatcat_release'] + releases[l_id] = left + releases[r_id] = right + if not group_ids: + group_ids.add(l_id) + group_ids.add(r_id) + continue + if l_id in group_ids or r_id in group_ids: + group_ids.add(l_id) + group_ids.add(r_id) + continue + + if not group_ids: + return + + print(json.dumps([releases[ident] for ident in group_ids])) + +def run(): + + last_slug = None + lines = [] + + # group lines by slug, and process in batches + for line in sys.stdin: + line = line.strip().split('\t') + assert len(line) == 4 + slug = line[0] + if last_slug and slug != last_slug and lines: + if len(lines) <= MAX_SLUG_LINES: + process_group(lines) + lines = [] + last_slug = slug + lines.append(line[1:]) + + # catch any remaining + if lines: + process_group(lines) + +if __name__=='__main__': + run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py new file mode 100755 index 0000000..3654b87 --- /dev/null +++ b/python/scripts/filter_scored_matches.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Filters an input stream of sorted "matchcrossref" scalding job, and outputs +"good enough" matches to be inserted to fatcat. + +Currently works on DOI numbers. Filters for a high enough string match (doesn't +re-do title match), and checks author lists. Filters out slugs with too many +matches, and outputs one-line-per-sha1 (aka, file). + +No dependencies (only python3 stdlib) +""" + +import sys +import json + +# out of 1000 +score_threshold = 900 + +max_slug_lines = 10 + +require_authors = 1 + + +def tokenize(s, remove_whitespace=False): + + s.replace(''', "'") + # Remove non-alphanumeric characters + s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + + if remove_whitespace: + s = ''.join(s.split()) + + # Encode as dumb ASCII (TODO: this is horrible) + return s.encode('ascii', 'replace').replace(b'?', b'') + +def check_authors(left, right): + """ + Intended to check GROBID extracted authors (right) against "known good" + (but maybe not perfect) Crossref metadata authors ("left"). + """ + if not left: + return False + if len(left) > len(right): + return False + right_all = tokenize(" ".join(right)) + for i in range(len(left)): + l = left[i].lower().replace('jr.', '').split() + if not l: + return False + l = tokenize(l[-1]) + if len(l) <= 1: + # weird author name (single char) + return False + if l not in right_all: + #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) + return False + return True + +def test_check_authors(): + assert not check_authors([], []) + assert not check_authors([], ['one']) + assert check_authors(['one'], ['one']) + assert check_authors(['one two'], ['One Two']) + assert check_authors(['two'], ['One Two']) + assert check_authors(['two'], ['two, one']) + assert check_authors(['mago'], ['Mr. Magoo']) + assert check_authors(['Mr. Magoo'], ['Mr Magoo']) + assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + +# Rows are (score, grobid, crossref) +def process_group(rows): + if len(rows) > max_slug_lines: + return + keepers = dict() + for row in rows: + score = int(row[0]) + if score < score_threshold: + continue + grobid = json.loads(row[1]) + crossref = json.loads(row[2]) + if not check_authors(crossref['authors'], grobid['authors']): + #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) + continue + else: + #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) + pass + sha1 = grobid['sha1'] + doi = crossref['doi'].lower() + l = keepers.get(sha1, list()) + l.append(doi) + keepers[sha1] = l + for sha1, doi_list in keepers.items(): + print("{}\t{}".format(sha1, json.dumps(doi_list))) + +def run(): + + last_slug = None + lines = [] + + # group lines by slug, and process in batches + for line in sys.stdin: + line = line.strip().split('\t') + assert len(line) == 4 + slug = line[0] + if last_slug and slug != last_slug and lines: + process_group(lines) + lines = [] + last_slug = slug + lines.append(line[1:]) + + # catch any remaining + if lines: + process_group(lines) + +if __name__=='__main__': + run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py new file mode 100755 index 0000000..3d2e14c --- /dev/null +++ b/python/scripts/import_grobid_metadata.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +import sys +import json +import datetime + +MAX_ABSTRACT_BYTES=4096 + +def parse_grobid_json(obj): + + if not obj.get('title'): + return None + + extra = dict() + + if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: + abobj = dict( + mimetype="text/plain", + language=None, + content=obj.get('abstract').strip()) + abstracts = [abobj] + else: + abstracts = None + + contribs = [] + for a in obj.get('authors', []): + c = dict(raw_name=a, role="author") + contribs.append(c) + + refs = [] + for raw in obj.get('citations', []): + extra = dict() + ref = dict() + ref['key'] = raw.get('id') + if raw.get('title'): + ref['title'] = raw['title'].strip() + if raw.get('date'): + try: + year = int(raw['date'].strip()[:4]) + ref['year'] = year + except: + pass + for key in ('volume', 'url', 'issue', 'publisher'): + if raw.get(key): + extra[key] = raw[key].strip() + if raw.get('authors'): + extra['authors'] = [a['name'] for a in raw['authors']] + if extra: + extra = dict(grobid=extra) + else: + extra = None + ref['extra'] = extra + refs.append(ref) + + release_type = "journal-article" + release_date = None + if obj.get('date'): + # TODO: only returns year, ever? how to handle? + release_date = datetime.datetime(year=obj['date'], month=1, day=1) + + if obj.get('doi'): + extra['doi'] = obj['doi'] + if obj['journal'].get('name'): + extra['container_name'] = obj['journal']['name'] + + extra['is_longtail_oa'] = True + + # TODO: ISSN/eISSN handling? or just journal name lookup? + + if extra: + extra = dict(grobid=extra) + else: + extra = None + + return dict( + title=obj['title'].strip(), + contribs=contribs, + publisher=obj['journal'].get('publisher'), + volume=obj['journal'].get('volume'), + issue=obj['journal'].get('issue'), + abstracts=abstracts, + release_type=release_type, + release_date=release_date, + extra=extra) + +def run(): + for line in sys.stdin: + obj = json.loads(line) + out = parse_grobid_json(obj) + if out: + print(out) + +if __name__=="__main__": + run() diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py new file mode 100755 index 0000000..35cee5b --- /dev/null +++ b/python/scripts/manifest_converter.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +""" +Reads a sqlite3 manifest database (IA 2017 style) and outputs a stream of +"match" JSON objects which can be imported into fatcat with matched_import.py + +This was used to convert this manifest: + + https://archive.org/details/ia_papers_manifest_2018-01-25/ + +to JSON format for fast fatcat importing. +""" + +import sys +import json +import sqlite3 + +# iterate over rows in files metadata... +# 1. select all identified DOIs +# => filter based on count +# 2. select all file metadata +# 3. output object + +def or_none(s): + if s is None: + return None + elif type(s) == str and ((not s) or s == "\\N" or s == "-"): + return None + return s + +def process_db(db_path): + + db = sqlite3.connect(db_path) + + for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"): + sha1 = row[0] + dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall() + dois = [d[0] for d in dois] + if not dois: + continue + urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall() + if not urls: + continue + cdx = [dict(url=row[0], dt=row[1]) for row in urls] + obj = dict( + sha1=sha1, + mimetype=or_none(row[1]), + size=(or_none(row[2]) and int(row[2])), + md5=or_none(row[3]), + dois=dois, + cdx=cdx, + ) + dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) + print(json.dumps(obj)) + +if __name__=="__main__": + process_db(sys.argv[1]) -- cgit v1.2.3