| author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
| commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
| tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a | /python/scripts |
| parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
| download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz, sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip | |
make fmt
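The hunks below are whitespace-only reformatting: continuation arguments are re-aligned under the opening parenthesis, two blank lines are enforced between top-level definitions, and the `__main__` guards gain spaces around `==`. The formatter itself is not shown in this diff; the style is consistent with yapf, but treat that as an assumption. A condensed, illustrative sketch of the resulting layout (not a file from the repository):

```python
#!/usr/bin/env python3
"""
Illustrative only (not a file from this repository): a minimal script laid
out in the post-`make fmt` style seen in the hunks below -- continuation
arguments aligned with the opening parenthesis, two blank lines between
top-level definitions, and a spaced `__main__` guard.
"""

import argparse
import json


def run(args):
    # Echo each non-empty JSON line back out with sorted keys, mirroring the
    # print pattern used by the ingest-request transform scripts.
    for line in args.json_file:
        if not line.strip():
            continue
        record = json.loads(line)
        print("{}".format(json.dumps(record, sort_keys=True)))


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
                        help="newline-delimited JSON file to echo back",
                        type=argparse.FileType('r'))
    args = parser.parse_args()
    run(args)


if __name__ == '__main__':
    main()
```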
Diffstat (limited to 'python/scripts')
| mode | path | lines changed |
|---|---|---|
| -rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 33 |
| -rwxr-xr-x | python/scripts/archiveorg_fileset.py | 39 |
| -rwxr-xr-x | python/scripts/cdx_collection.py | 24 |
| -rwxr-xr-x | python/scripts/covid2ingestrequest.py | 12 |
| -rwxr-xr-x | python/scripts/deliver_dumpgrobid_to_s3.py | 11 |
| -rwxr-xr-x | python/scripts/deliver_gwb_to_disk.py | 71 |
| -rwxr-xr-x | python/scripts/deliver_gwb_to_s3.py | 66 |
| -rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 15 |
| -rwxr-xr-x | python/scripts/enrich_scored_matches.py | 14 |
| -rwxr-xr-x | python/scripts/filter_grobid_metadata.py | 18 |
| -rwxr-xr-x | python/scripts/filter_groupworks.py | 8 |
| -rwxr-xr-x | python/scripts/filter_scored_matches.py | 7 |
| -rwxr-xr-x | python/scripts/grobid_affiliations.py | 6 |
| -rwxr-xr-x | python/scripts/import_grobid_metadata.py | 31 |
| -rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 11 |
| -rwxr-xr-x | python/scripts/manifest_converter.py | 5 |
| -rwxr-xr-x | python/scripts/oai2ingestrequest.py | 19 |
| -rwxr-xr-x | python/scripts/pdf_thumbnail.py | 7 |
| -rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 19 |
19 files changed, 230 insertions, 186 deletions
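Since every change in this commit should be a no-op at runtime, it can be handy to confirm the tree is formatter-clean after applying it. The snippet below is a sketch under the assumption that yapf is the tool behind `make fmt` (the Makefile is outside this diff); it reports any script under python/scripts/ that would still be rewritten.

```python
#!/usr/bin/env python3
"""
Sketch, not part of the repository: assumes yapf is installed and is the
formatter driving `make fmt`. Prints the unified diff for any file under
python/scripts/ that is not already formatted.
"""

import subprocess
import sys

# yapf --diff prints what it *would* change without touching files;
# empty output means the directory is already formatter-clean.
result = subprocess.run(
    ["yapf", "--diff", "--recursive", "python/scripts/"],
    capture_output=True,
    text=True,
)
if result.stdout.strip():
    print(result.stdout)
    sys.exit("python/scripts/ still has unformatted files; run `make fmt`")
print("python/scripts/ is formatter-clean")
```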
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 69fe320..9cc9055 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ This script is intended to be used for backfill ingest of old crawls. It can also be used as a fast path for getting freshly crawled content into fatcat if @@ -36,37 +35,35 @@ def run(args): }, } if args.release_stage: - assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update') + assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', + 'update') request['release_stage'] = args.release_stage print("{}".format(json.dumps(request, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--link-source', - required=True, - help="link_source to include in request") - parser.add_argument('--extid-type', - required=True, - help="extid to encode identifier as") + required=True, + help="link_source to include in request") + parser.add_argument('--extid-type', required=True, help="extid to encode identifier as") parser.add_argument('--ingest-type', - default="pdf", - help="ingest type (pdf, html, xml, etc)") + default="pdf", + help="ingest type (pdf, html, xml, etc)") parser.add_argument('--ingest-request-source', - default="arabesque", - help="to include in request") - parser.add_argument('--release-stage', - default=None, - help="to include in request") + default="arabesque", + help="to include in request") + parser.add_argument('--release-stage', default=None, help="to include in request") parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + help="arabesque output file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py index 86ca062..83c04e3 100755 --- a/python/scripts/archiveorg_fileset.py +++ b/python/scripts/archiveorg_fileset.py @@ -23,11 +23,9 @@ FORMAT_TO_MIMETYPE = { 'RAR': 'application/vnd.rar', 'TAR': 'application/x-tar', '7z': 'application/x-7z-compressed', - 'HTML': 'text/html', 'Text': 'text/plain', 'PDF': 'application/pdf', - 'CSV': 'text/csv', 'XML': 'application/xml', 'JSON': 'application/json', @@ -36,20 +34,17 @@ FORMAT_TO_MIMETYPE = { #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx #'application/vnd.ms-excel', # .xls #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx - - 'MP3': 'audio/mpeg', # .mp3 - - 'MP4': 'video/mp4', # .mp4 - 'MPEG': 'video/mpeg', # .mpeg - + 'MP3': 'audio/mpeg', # .mp3 + 'MP4': 'video/mp4', # .mp4 + 'MPEG': 'video/mpeg', # .mpeg 'JPEG': 'image/jpeg', 'GIF': 'image/gif', 'PNG': 'image/png', 'TIFF': 'image/tiff', - 'Unknown': None, } + def want_file(f: dict, item_name: str) -> bool: """ Filters IA API files @@ -57,12 +52,12 @@ def want_file(f: dict, item_name: str) -> bool: if f.source != 'original': return False for suffix in [ - '_meta.sqlite', - '_archive.torrent', - '_itemimage.jpg', - '_meta.xml', - '_thumb.png', - '_files.xml', + '_meta.sqlite', + '_archive.torrent', + '_itemimage.jpg', + '_meta.xml', + '_thumb.png', + '_files.xml', ]: if f.name == item_name + 
suffix or f.name == item_name.lower() + suffix: return False @@ -74,6 +69,7 @@ def want_file(f: dict, item_name: str) -> bool: return False return True + def parse_file(f: dict) -> dict: """ Takes an IA API file and turns it in to a fatcat fileset manifest file @@ -93,6 +89,7 @@ def parse_file(f: dict) -> dict: mf['extra'] = dict(mimetype=mimetype) return mf + def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession): print(f"processing item={item_name} release_id={release_id}", file=sys.stderr) if release_id.startswith('release_'): @@ -104,18 +101,17 @@ def item_to_fileset(item_name: str, release_id: str, session: internetarchive.Ar manifest = [parse_file(f) for f in item_files if want_file(f, item_name)] fileset = { 'manifest': manifest, - 'urls': [ - { - 'rel': 'archive', - 'url': f'https://archive.org/download/{item_name}/', - }, - ], + 'urls': [{ + 'rel': 'archive', + 'url': f'https://archive.org/download/{item_name}/', + }, ], 'release_ids': [release_id], #extra={}, } print(json.dumps(fileset)) return fileset + def main(): session = internetarchive.get_session() if len(sys.argv) == 3: @@ -133,5 +129,6 @@ def main(): release_id = fields[1] item_to_fileset(item_name, release_id=release_id, session=session) + if __name__ == '__main__': main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py index 5e33def..aa78aec 100755 --- a/python/scripts/cdx_collection.py +++ b/python/scripts/cdx_collection.py @@ -35,9 +35,7 @@ def run(): print("Looking up collection: {}".format(collection)) # First fetch list - item_list = list( - ia.search_items( - query="collection:{} mediatype:web".format(collection))) + item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection))) if len(item_list) == 0: print("No items found, bailing") @@ -50,11 +48,12 @@ def run(): item = item['identifier'] # TODO: error handling try: - ret = ia.download(item, files=[item + '.cdx.gz'], - verbose=True, - destdir=tempdir, - no_directory=True, - retries=1000) + ret = ia.download(item, + files=[item + '.cdx.gz'], + verbose=True, + destdir=tempdir, + no_directory=True, + retries=1000) status = ret and status except requests.exceptions.ReadTimeout as rt: print(str(rt), file=sys.stderr) @@ -69,14 +68,13 @@ def run(): # Combine files print("Merging and re-compressing all CDX files...") #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), - subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), - shell=True) + subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True) # Move and cleanup - shutil.move('{}/combined.gz'.format(tempdir), - '{}.cdx.gz'.format(collection)) + shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection)) print("Done!") -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 1b7c85c..4714b60 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. 
""" @@ -21,7 +20,6 @@ def transform_cnki(obj): requests = [] assert obj['cnki_id'] - requests = [] requests.append({ 'base_url': canon(obj['info_url']), @@ -41,6 +39,7 @@ def transform_cnki(obj): return requests + def transform_wanfang(obj): assert obj['wanfang_id'] @@ -68,17 +67,18 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="COVID-19 metadata file to use", - type=argparse.FileType('r')) + help="COVID-19 metadata file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 62a85e6..3b53235 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -49,7 +49,6 @@ def b32_hex(s): class DeliverDumpGrobidS3(): - def __init__(self, s3_bucket, **kwargs): self.rstore = None self.count = Counter() @@ -80,11 +79,7 @@ class DeliverDumpGrobidS3(): tei_xml = tei_xml.encode('utf-8') # upload to AWS S3 obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), + Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), Body=tei_xml, StorageClass=self.s3_storage_class, ) @@ -92,6 +87,7 @@ class DeliverDumpGrobidS3(): self.count['success-s3'] += 1 sys.stderr.write("{}\n".format(self.count)) + @sentry_client.capture_exceptions def main(): @@ -121,5 +117,6 @@ def main(): worker = DeliverDumpGrobidS3(**args.__dict__) worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index ab1906a..ca19b97 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -26,7 +26,6 @@ sentry_client = raven.Client() class DeliverGwbDisk: - def __init__(self, disk_dir, **kwargs): self.warc_uri_prefix = kwargs.get('warc_uri_prefix') self.rstore = None @@ -34,7 +33,8 @@ class DeliverGwbDisk: # /serve/ instead of /download/ doesn't record view count self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', + os.environ.get('PETABOX_WEBDATA_SECRET')) self.disk_dir = disk_dir self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') self.disk_suffix = kwargs.get('disk_suffix', '.pdf') @@ -42,48 +42,56 @@ class DeliverGwbDisk: def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url)) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: 
- return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" + ) except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})". + format(ve)) except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})". + format(eofe)) except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + return None, dict( + status="error", + reason= + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" + .format(te)) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0]) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return None, dict( + status="error", + reason= + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". 
+ format(ire)) return raw_content, None def run(self, manifest_file): sys.stderr.write("Ensuring all 65536 base directories exist...\n") for i in range(256): for j in range(256): - fpath = "{}/{}{:02x}/{:02x}".format( - self.disk_dir, - self.disk_prefix, - i, - j) + fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j) os.makedirs(fpath, exist_ok=True) sys.stderr.write("Starting...\n") for line in manifest_file: @@ -102,9 +110,11 @@ class DeliverGwbDisk: self.count['skip-warc'] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], + file_cdx['c_size']) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) + print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], + status['reason'])) self.count['err-petabox-fetch'] += 1 continue elif not blob: @@ -120,19 +130,15 @@ class DeliverGwbDisk: self.count['petabox-ok'] += 1 # save to disk - fpath = "{}/{}{}/{}/{}{}".format( - self.disk_dir, - self.disk_prefix, - sha1_hex[0:2], - sha1_hex[2:4], - sha1_hex, - self.disk_suffix) + fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2], + sha1_hex[2:4], sha1_hex, self.disk_suffix) with open(fpath, 'wb') as f: f.write(blob) print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) self.count['success-disk'] += 1 sys.stderr.write("{}\n".format(self.count)) + @sentry_client.capture_exceptions def main(): @@ -162,5 +168,6 @@ def main(): worker = DeliverGwbDisk(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index f103205..f9b3b19 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -53,7 +53,6 @@ sentry_client = raven.Client() class DeliverGwbS3: - def __init__(self, s3_bucket, **kwargs): self.warc_uri_prefix = kwargs.get('warc_uri_prefix') self.rstore = None @@ -61,7 +60,8 @@ class DeliverGwbS3: # /serve/ instead of /download/ doesn't record view count self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', + os.environ.get('PETABOX_WEBDATA_SECRET')) self.s3_bucket = s3_bucket self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') self.s3_suffix = kwargs.get('s3_suffix', '.pdf') @@ -71,37 +71,49 @@ class DeliverGwbS3: def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: - self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory( - webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + self.rstore = ResourceStore( + loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url)) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + return None, dict( + 
status="error", + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" + ) except ValueError as ve: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})". + format(ve)) except EOFError as eofe: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + return None, dict( + status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})". + format(eofe)) except TypeError as te: - return None, dict(status="error", - reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te)) + return None, dict( + status="error", + reason= + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" + .format(te)) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0]) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: - return None, dict(status="error", - reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire)) + return None, dict( + status="error", + reason= + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". + format(ire)) return raw_content, None def run(self, manifest_file): @@ -122,9 +134,11 @@ class DeliverGwbS3: self.count['skip-warc'] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size']) + blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], + file_cdx['c_size']) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason'])) + print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], + status['reason'])) self.count['err-petabox-fetch'] += 1 continue elif not blob: @@ -140,17 +154,14 @@ class DeliverGwbS3: self.count['petabox-ok'] += 1 # upload to AWS S3 - obj = self.bucket.put_object( - Key="{}{}/{}{}".format( - self.s3_prefix, - sha1_hex[0:4], - sha1_hex, - self.s3_suffix), - Body=blob) + obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], + sha1_hex, self.s3_suffix), + Body=blob) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) self.count['success-s3'] += 1 sys.stderr.write("{}\n".format(self.count)) + @sentry_client.capture_exceptions def main(): @@ -180,5 +191,6 @@ def main(): worker = DeliverGwbS3(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover + +if __name__ == '__main__': # pragma: no cover main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index 15b30a0..84a2c2c 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an DOAJ article dump (JSON) into ingest requests. 
@@ -42,22 +41,22 @@ CONTENT_TYPE_MAP = { "abstract": [], "doc": [], "": ["pdf"], - "doi": ["pdf"], "url": ["pdf"], "fulltext": ["pdf"], "anySimpleType": ["pdf"], - "application/pdf": ["pdf"], "html": ["html", "pdf"], "text/html": ["html", "pdf"], "xml": ["xml"], } + def canon(s: str) -> str: parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj: dict) -> List[dict]: """ Transforms from a single DOAJ object to zero or more ingest requests. @@ -118,6 +117,7 @@ def transform(obj: dict) -> List[dict]: return requests + def run(args) -> None: for l in args.json_file: if not l.strip(): @@ -128,17 +128,18 @@ def run(args) -> None: for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main() -> None: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="DOAJ article dump file to use", - type=argparse.FileType('r')) + help="DOAJ article dump file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 3085346..54c3d5f 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -34,13 +34,13 @@ def run(): sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() - obj = dict( - sha1=sha1, - dois=dois, - cdx=[dict(url=cdx['url'], dt=cdx['dt'])], - size=size, - mimetype=mimetype) + obj = dict(sha1=sha1, + dois=dois, + cdx=[dict(url=cdx['url'], dt=cdx['dt'])], + size=size, + mimetype=mimetype) print(json.dumps(obj)) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index d0666ce..a474393 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -24,6 +24,7 @@ NAME_DENYLIST = ( 'phdstudent', ) + def tokenize(s, remove_whitespace=True): s.replace(''', "'") @@ -36,9 +37,11 @@ def tokenize(s, remove_whitespace=True): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + assert tokenize("Impact Factor: 2.114") == "impactfactor" assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST + def filter_title(title): title = title.strip() @@ -83,19 +86,23 @@ def filter_title(title): return title + def filter_author_name(name): name = name['name'] if name.strip().lower().replace(' ', '') in NAME_DENYLIST: return None return ' '.join([t for t in name.split() if tokenize(t)]) + def filter_authors(l): return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1] + def filter_refs(l): # TODO: return l + def filter_journal_name(name): # same denylist, for now if not name: @@ -104,10 +111,12 @@ def filter_journal_name(name): slug_name = tokenize(name) if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": return None - for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "): + for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", + "Research Article ", "Available online www.jocpr.com "): if name.startswith(prefix): name = name.replace(prefix, '') - for suffix in (" Available 
online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"): + for suffix in (" Available online at www.sciarena.com", " Original Article", + " Available online at", " ISSN", " ISSUE"): if name.endswith(suffix): name = name.replace(suffix, '') if "====================" in name: @@ -116,6 +125,7 @@ def filter_journal_name(name): return None return ' '.join(name.split()) + def filter_metadata(obj): if not (obj.get('title') and obj.get('authors')): return None @@ -132,6 +142,7 @@ def filter_metadata(obj): return obj + def run(invert=False): for line in sys.stdin: fields = line.split('\t') @@ -155,5 +166,6 @@ def run(invert=False): elif invert: print(raw.strip()) -if __name__=="__main__": + +if __name__ == "__main__": run(invert="--invert" in sys.argv) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index 494da71..fda9098 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -28,6 +28,7 @@ MAX_SLUG_LINES = 50 REQUIRE_AUTHORS = False + def tokenize(s, remove_whitespace=False): s.replace(''', "'") @@ -40,6 +41,7 @@ def tokenize(s, remove_whitespace=False): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').replace(b'?', b'') + def check_authors(left, right): """ Intended to check GROBID extracted authors (right) against "known good" @@ -63,6 +65,7 @@ def check_authors(left, right): return False return True + def test_check_authors(): assert check_authors([], []) == bool(not REQUIRE_AUTHORS) assert not check_authors([], ['one']) @@ -74,6 +77,7 @@ def test_check_authors(): assert check_authors(['Mr. Magoo'], ['Mr Magoo']) assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + # Rows are (score, left, right) def process_group(rows): @@ -119,6 +123,7 @@ def process_group(rows): print(json.dumps([releases[ident] for ident in group_ids])) + def run(): last_slug = None @@ -140,5 +145,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index abf81bd..3251852 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -33,6 +33,7 @@ def tokenize(s, remove_whitespace=False): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').replace(b'?', b'') + def check_authors(left, right): """ Intended to check GROBID extracted authors (right) against "known good" @@ -56,6 +57,7 @@ def check_authors(left, right): return False return True + def test_check_authors(): assert not check_authors([], []) assert not check_authors([], ['one']) @@ -67,6 +69,7 @@ def test_check_authors(): assert check_authors(['Mr. 
Magoo'], ['Mr Magoo']) assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + # Rows are (score, grobid, crossref) def process_group(rows): if len(rows) > max_slug_lines: @@ -92,6 +95,7 @@ def process_group(rows): for sha1, doi_list in keepers.items(): print("{}\t{}".format(sha1, json.dumps(doi_list))) + def run(): last_slug = None @@ -112,5 +116,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index d391f60..b42153c 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction output, converts the XML to JSON, filters out raw affiliation strings, and @@ -24,10 +23,12 @@ def parse_hbase(line): tei_xml = obj['tei_xml'] return sha1hex, tei_xml + def parse_pg(line): obj = json.loads(line) return obj['sha1hex'], obj['tei_xml'] + def run(mode='hbase'): for line in sys.stdin: if mode == 'hbase': @@ -49,5 +50,6 @@ def run(mode='hbase'): affiliations = [json.loads(a) for a in affiliations] print('\t'.join([sha1hex, json.dumps(affiliations)])) -if __name__=='__main__': + +if __name__ == '__main__': run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index 8aee0be..c9bc134 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -4,7 +4,8 @@ import datetime import json import sys -MAX_ABSTRACT_BYTES=4096 +MAX_ABSTRACT_BYTES = 4096 + def parse_grobid_json(obj): @@ -14,10 +15,7 @@ def parse_grobid_json(obj): extra = dict() if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: - abobj = dict( - mimetype="text/plain", - language=None, - content=obj.get('abstract').strip()) + abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip()) abstracts = [abobj] else: abstracts = None @@ -72,16 +70,16 @@ def parse_grobid_json(obj): else: extra = None - return dict( - title=obj['title'].strip(), - contribs=contribs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), - abstracts=abstracts, - release_type=release_type, - release_date=release_date, - extra=extra) + return dict(title=obj['title'].strip(), + contribs=contribs, + publisher=obj['journal'].get('publisher'), + volume=obj['journal'].get('volume'), + issue=obj['journal'].get('issue'), + abstracts=abstracts, + release_type=release_type, + release_date=release_date, + extra=extra) + def run(): for line in sys.stdin: @@ -90,5 +88,6 @@ def run(): if out: print(out) -if __name__=="__main__": + +if __name__ == "__main__": run() diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index acba2a8..70731d5 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ This script is used to turn ingest request postgres rows (in JSON export format) back in to regular ingest request JSON. 
@@ -25,6 +24,7 @@ def transform(row): row['fatcat'] = dict(release_ident=extra['release_ident']) return row + def run(args): for l in args.json_file: if not l.strip(): @@ -35,17 +35,18 @@ def run(args): print(l, file=sys.stderr) print(json.dumps(req, sort_keys=True)) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + help="arabesque output file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py index 8267003..24e22fd 100755 --- a/python/scripts/manifest_converter.py +++ b/python/scripts/manifest_converter.py @@ -20,6 +20,7 @@ import sys # 2. select all file metadata # 3. output object + def or_none(s): if s is None: return None @@ -27,6 +28,7 @@ def or_none(s): return None return s + def process_db(db_path): db = sqlite3.connect(db_path) @@ -52,5 +54,6 @@ def process_db(db_path): dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]) print(json.dumps(obj)) -if __name__=="__main__": + +if __name__ == "__main__": process_db(sys.argv[1]) diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 315b8d2..1f4a19f 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an OAI-PMH bulk dump (JSON) into ingest requests. @@ -33,17 +32,19 @@ DOMAIN_BLOCKLIST = [ ] RELEASE_STAGE_MAP = { - 'info:eu-repo/semantics/draftVersion': 'draft', + 'info:eu-repo/semantics/draftVersion': 'draft', 'info:eu-repo/semantics/submittedVersion': 'submitted', - 'info:eu-repo/semantics/acceptedVersion': 'accepted', + 'info:eu-repo/semantics/acceptedVersion': 'accepted', 'info:eu-repo/semantics/publishedVersion': 'published', - 'info:eu-repo/semantics/updatedVersion': 'updated', + 'info:eu-repo/semantics/updatedVersion': 'updated', } + def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj): """ Transforms from a single OAI-PMH object to zero or more ingest requests. @@ -112,6 +113,7 @@ def transform(obj): return requests + def run(args): for l in args.json_file: if not l.strip(): @@ -122,17 +124,18 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="OAI-PMH dump file to use (usually stdin)", - type=argparse.FileType('r')) + help="OAI-PMH dump file to use (usually stdin)", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index 71fbe54..3f81b3b 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc). 
@@ -23,12 +22,14 @@ def run(inpath, outpath): renderer = poppler.PageRenderer() full_page = renderer.render_page(page) - img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1) - img.thumbnail((180,300), Image.BICUBIC) + img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', + "BGRA", 0, 1) + img.thumbnail((180, 300), Image.BICUBIC) #img.thumbnail((360,600), Image.BICUBIC) img.save(outpath) #img.save(outpath, quality=95) + if __name__ == '__main__': if len(sys.argv) != 3: print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 590b429..b79f316 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an unpaywall dump (JSON) into ingest requests. """ @@ -26,17 +25,19 @@ DOMAIN_BLOCKLIST = [ ] RELEASE_STAGE_MAP = { - 'draftVersion': 'draft', + 'draftVersion': 'draft', 'submittedVersion': 'submitted', - 'acceptedVersion': 'accepted', + 'acceptedVersion': 'accepted', 'publishedVersion': 'published', - 'updatedVersion': 'updated', + 'updatedVersion': 'updated', } + def canon(s): parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj): """ Transforms from a single unpaywall object to zero or more ingest requests. @@ -86,6 +87,7 @@ def transform(obj): return requests + def run(args): for l in args.json_file: if not l.strip(): @@ -96,17 +98,18 @@ def run(args): for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="unpaywall dump file to use", - type=argparse.FileType('r')) + help="unpaywall dump file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() |