author     Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
commit     05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree       abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts
parent     f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
download   sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz
           sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
make fmt
Diffstat (limited to 'python/scripts')
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py   | 33
-rwxr-xr-x  python/scripts/archiveorg_fileset.py        | 39
-rwxr-xr-x  python/scripts/cdx_collection.py            | 24
-rwxr-xr-x  python/scripts/covid2ingestrequest.py       | 12
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py  | 11
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py       | 71
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py         | 66
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py        | 15
-rwxr-xr-x  python/scripts/enrich_scored_matches.py     | 14
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py    | 18
-rwxr-xr-x  python/scripts/filter_groupworks.py         |  8
-rwxr-xr-x  python/scripts/filter_scored_matches.py     |  7
-rwxr-xr-x  python/scripts/grobid_affiliations.py       |  6
-rwxr-xr-x  python/scripts/import_grobid_metadata.py    | 31
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py    | 11
-rwxr-xr-x  python/scripts/manifest_converter.py        |  5
-rwxr-xr-x  python/scripts/oai2ingestrequest.py         | 19
-rwxr-xr-x  python/scripts/pdf_thumbnail.py             |  7
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py   | 19
19 files changed, 230 insertions, 186 deletions
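Note on the change itself: the commit message refers to a `make fmt` target whose definition is not part of this diff. Judging from the resulting style (continuation arguments aligned under the opening parenthesis, exactly two blank lines between top-level definitions, normalized spacing around `==` and before inline comments), the formatter was most likely yapf. The following is only a hedged sketch of what such a target could look like; the actual Makefile target name, paths, and any .style.yapf configuration in the repository may differ:

    # Hypothetical Makefile target (paths are assumptions, not taken from this commit)
    .PHONY: fmt
    fmt:
    	yapf --in-place --recursive scripts/ sandcrawler/ tests/

A matching check would be `yapf --diff --recursive` over the same paths, which prints whitespace-only edits like those in the diff below without modifying any files, making it suitable as a CI lint step.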
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 69fe320..9cc9055 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is intended to be used for backfill ingest of old crawls. It can
also be used as a fast path for getting freshly crawled content into fatcat if
@@ -36,37 +35,35 @@ def run(args):
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
+ assert args.release_stage in ('published', 'submitted', 'accepted', 'draft',
+ 'update')
request['release_stage'] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type',
- required=True,
- help="extid to encode identifier as")
+ required=True,
+ help="link_source to include in request")
+ parser.add_argument('--extid-type', required=True, help="extid to encode identifier as")
parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
+ default="pdf",
+ help="ingest type (pdf, html, xml, etc)")
parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage',
- default=None,
- help="to include in request")
+ default="arabesque",
+ help="to include in request")
+ parser.add_argument('--release-stage', default=None, help="to include in request")
parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 86ca062..83c04e3 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -23,11 +23,9 @@ FORMAT_TO_MIMETYPE = {
'RAR': 'application/vnd.rar',
'TAR': 'application/x-tar',
'7z': 'application/x-7z-compressed',
-
'HTML': 'text/html',
'Text': 'text/plain',
'PDF': 'application/pdf',
-
'CSV': 'text/csv',
'XML': 'application/xml',
'JSON': 'application/json',
@@ -36,20 +34,17 @@ FORMAT_TO_MIMETYPE = {
#'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
#'application/vnd.ms-excel', # .xls
#'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
-
- 'MP3': 'audio/mpeg', # .mp3
-
- 'MP4': 'video/mp4', # .mp4
- 'MPEG': 'video/mpeg', # .mpeg
-
+ 'MP3': 'audio/mpeg', # .mp3
+ 'MP4': 'video/mp4', # .mp4
+ 'MPEG': 'video/mpeg', # .mpeg
'JPEG': 'image/jpeg',
'GIF': 'image/gif',
'PNG': 'image/png',
'TIFF': 'image/tiff',
-
'Unknown': None,
}
+
def want_file(f: dict, item_name: str) -> bool:
"""
Filters IA API files
@@ -57,12 +52,12 @@ def want_file(f: dict, item_name: str) -> bool:
if f.source != 'original':
return False
for suffix in [
- '_meta.sqlite',
- '_archive.torrent',
- '_itemimage.jpg',
- '_meta.xml',
- '_thumb.png',
- '_files.xml',
+ '_meta.sqlite',
+ '_archive.torrent',
+ '_itemimage.jpg',
+ '_meta.xml',
+ '_thumb.png',
+ '_files.xml',
]:
if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
return False
@@ -74,6 +69,7 @@ def want_file(f: dict, item_name: str) -> bool:
return False
return True
+
def parse_file(f: dict) -> dict:
"""
Takes an IA API file and turns it in to a fatcat fileset manifest file
@@ -93,6 +89,7 @@ def parse_file(f: dict) -> dict:
mf['extra'] = dict(mimetype=mimetype)
return mf
+
def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
if release_id.startswith('release_'):
@@ -104,18 +101,17 @@ def item_to_fileset(item_name: str, release_id: str, session: internetarchive.Ar
manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
fileset = {
'manifest': manifest,
- 'urls': [
- {
- 'rel': 'archive',
- 'url': f'https://archive.org/download/{item_name}/',
- },
- ],
+ 'urls': [{
+ 'rel': 'archive',
+ 'url': f'https://archive.org/download/{item_name}/',
+ }, ],
'release_ids': [release_id],
#extra={},
}
print(json.dumps(fileset))
return fileset
+
def main():
session = internetarchive.get_session()
if len(sys.argv) == 3:
@@ -133,5 +129,6 @@ def main():
release_id = fields[1]
item_to_fileset(item_name, release_id=release_id, session=session)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index 5e33def..aa78aec 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -35,9 +35,7 @@ def run():
print("Looking up collection: {}".format(collection))
# First fetch list
- item_list = list(
- ia.search_items(
- query="collection:{} mediatype:web".format(collection)))
+ item_list = list(ia.search_items(query="collection:{} mediatype:web".format(collection)))
if len(item_list) == 0:
print("No items found, bailing")
@@ -50,11 +48,12 @@ def run():
item = item['identifier']
# TODO: error handling
try:
- ret = ia.download(item, files=[item + '.cdx.gz'],
- verbose=True,
- destdir=tempdir,
- no_directory=True,
- retries=1000)
+ ret = ia.download(item,
+ files=[item + '.cdx.gz'],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000)
status = ret and status
except requests.exceptions.ReadTimeout as rt:
print(str(rt), file=sys.stderr)
@@ -69,14 +68,13 @@ def run():
# Combine files
print("Merging and re-compressing all CDX files...")
#subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
- subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir),
- shell=True)
+ subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True)
# Move and cleanup
- shutil.move('{}/combined.gz'.format(tempdir),
- '{}.cdx.gz'.format(collection))
+ shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection))
print("Done!")
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 1b7c85c..4714b60 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
@@ -21,7 +20,6 @@ def transform_cnki(obj):
requests = []
assert obj['cnki_id']
-
requests = []
requests.append({
'base_url': canon(obj['info_url']),
@@ -41,6 +39,7 @@ def transform_cnki(obj):
return requests
+
def transform_wanfang(obj):
assert obj['wanfang_id']
@@ -68,17 +67,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ help="COVID-19 metadata file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 62a85e6..3b53235 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -49,7 +49,6 @@ def b32_hex(s):
class DeliverDumpGrobidS3():
-
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
@@ -80,11 +79,7 @@ class DeliverDumpGrobidS3():
tei_xml = tei_xml.encode('utf-8')
# upload to AWS S3
obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
Body=tei_xml,
StorageClass=self.s3_storage_class,
)
@@ -92,6 +87,7 @@ class DeliverDumpGrobidS3():
self.count['success-s3'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -121,5 +117,6 @@ def main():
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index ab1906a..ca19b97 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -26,7 +26,6 @@ sentry_client = raven.Client()
class DeliverGwbDisk:
-
def __init__(self, disk_dir, **kwargs):
self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
self.rstore = None
@@ -34,7 +33,8 @@ class DeliverGwbDisk:
# /serve/ instead of /download/ doesn't record view count
self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
+ os.environ.get('PETABOX_WEBDATA_SECRET'))
self.disk_dir = disk_dir
self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
@@ -42,48 +42,56 @@ class DeliverGwbDisk:
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".
+ format(ve))
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".
+ format(eofe))
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
+ .format(te))
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
+ format(ire))
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Ensuring all 65536 base directories exist...\n")
for i in range(256):
for j in range(256):
- fpath = "{}/{}{:02x}/{:02x}".format(
- self.disk_dir,
- self.disk_prefix,
- i,
- j)
+ fpath = "{}/{}{:02x}/{:02x}".format(self.disk_dir, self.disk_prefix, i, j)
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
@@ -102,9 +110,11 @@ class DeliverGwbDisk:
self.count['skip-warc'] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
+ file_cdx['c_size'])
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
+ status['reason']))
self.count['err-petabox-fetch'] += 1
continue
elif not blob:
@@ -120,19 +130,15 @@ class DeliverGwbDisk:
self.count['petabox-ok'] += 1
# save to disk
- fpath = "{}/{}{}/{}/{}{}".format(
- self.disk_dir,
- self.disk_prefix,
- sha1_hex[0:2],
- sha1_hex[2:4],
- sha1_hex,
- self.disk_suffix)
+ fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2],
+ sha1_hex[2:4], sha1_hex, self.disk_suffix)
with open(fpath, 'wb') as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
self.count['success-disk'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -162,5 +168,6 @@ def main():
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index f103205..f9b3b19 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -53,7 +53,6 @@ sentry_client = raven.Client()
class DeliverGwbS3:
-
def __init__(self, s3_bucket, **kwargs):
self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
self.rstore = None
@@ -61,7 +60,8 @@ class DeliverGwbS3:
# /serve/ instead of /download/ doesn't record view count
self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
+ os.environ.get('PETABOX_WEBDATA_SECRET'))
self.s3_bucket = s3_bucket
self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
@@ -71,37 +71,49 @@ class DeliverGwbS3:
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
- self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
- webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ self.rstore = ResourceStore(
+ loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url))
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ )
except ValueError as ve:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".
+ format(ve))
except EOFError as eofe:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ return None, dict(
+ status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".
+ format(eofe))
except TypeError as te:
- return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
+ .format(te))
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
- return None, dict(status="error",
- reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return None, dict(
+ status="error",
+ reason=
+ "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
+ format(ire))
return raw_content, None
def run(self, manifest_file):
@@ -122,9 +134,11 @@ class DeliverGwbS3:
self.count['skip-warc'] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
+ file_cdx['c_size'])
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], status['reason']))
+ print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
+ status['reason']))
self.count['err-petabox-fetch'] += 1
continue
elif not blob:
@@ -140,17 +154,14 @@ class DeliverGwbS3:
self.count['petabox-ok'] += 1
# upload to AWS S3
- obj = self.bucket.put_object(
- Key="{}{}/{}{}".format(
- self.s3_prefix,
- sha1_hex[0:4],
- sha1_hex,
- self.s3_suffix),
- Body=blob)
+ obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4],
+ sha1_hex, self.s3_suffix),
+ Body=blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
self.count['success-s3'] += 1
sys.stderr.write("{}\n".format(self.count))
+
@sentry_client.capture_exceptions
def main():
@@ -180,5 +191,6 @@ def main():
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+
+if __name__ == '__main__': # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 15b30a0..84a2c2c 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an DOAJ article dump (JSON) into ingest requests.
@@ -42,22 +41,22 @@ CONTENT_TYPE_MAP = {
"abstract": [],
"doc": [],
"": ["pdf"],
-
"doi": ["pdf"],
"url": ["pdf"],
"fulltext": ["pdf"],
"anySimpleType": ["pdf"],
-
"application/pdf": ["pdf"],
"html": ["html", "pdf"],
"text/html": ["html", "pdf"],
"xml": ["xml"],
}
+
def canon(s: str) -> str:
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj: dict) -> List[dict]:
"""
Transforms from a single DOAJ object to zero or more ingest requests.
@@ -118,6 +117,7 @@ def transform(obj: dict) -> List[dict]:
return requests
+
def run(args) -> None:
for l in args.json_file:
if not l.strip():
@@ -128,17 +128,18 @@ def run(args) -> None:
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main() -> None:
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="DOAJ article dump file to use",
- type=argparse.FileType('r'))
+ help="DOAJ article dump file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 3085346..54c3d5f 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -34,13 +34,13 @@ def run():
sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
- obj = dict(
- sha1=sha1,
- dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
- size=size,
- mimetype=mimetype)
+ obj = dict(sha1=sha1,
+ dois=dois,
+ cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+ size=size,
+ mimetype=mimetype)
print(json.dumps(obj))
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index d0666ce..a474393 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -24,6 +24,7 @@ NAME_DENYLIST = (
'phdstudent',
)
+
def tokenize(s, remove_whitespace=True):
s.replace('&apos;', "'")
@@ -36,9 +37,11 @@ def tokenize(s, remove_whitespace=True):
# Encode as dumb ASCII (TODO: this is horrible)
return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
+
def filter_title(title):
title = title.strip()
@@ -83,19 +86,23 @@ def filter_title(title):
return title
+
def filter_author_name(name):
name = name['name']
if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
return None
return ' '.join([t for t in name.split() if tokenize(t)])
+
def filter_authors(l):
return [dict(name=n) for n in map(filter_author_name, l) if n and len(n) > 1]
+
def filter_refs(l):
# TODO:
return l
+
def filter_journal_name(name):
# same denylist, for now
if not name:
@@ -104,10 +111,12 @@ def filter_journal_name(name):
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
+ for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ",
+ "Research Article ", "Available online www.jocpr.com "):
if name.startswith(prefix):
name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article", " Available online at", " ISSN", " ISSUE"):
+ for suffix in (" Available online at www.sciarena.com", " Original Article",
+ " Available online at", " ISSN", " ISSUE"):
if name.endswith(suffix):
name = name.replace(suffix, '')
if "====================" in name:
@@ -116,6 +125,7 @@ def filter_journal_name(name):
return None
return ' '.join(name.split())
+
def filter_metadata(obj):
if not (obj.get('title') and obj.get('authors')):
return None
@@ -132,6 +142,7 @@ def filter_metadata(obj):
return obj
+
def run(invert=False):
for line in sys.stdin:
fields = line.split('\t')
@@ -155,5 +166,6 @@ def run(invert=False):
elif invert:
print(raw.strip())
-if __name__=="__main__":
+
+if __name__ == "__main__":
run(invert="--invert" in sys.argv)
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index 494da71..fda9098 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -28,6 +28,7 @@ MAX_SLUG_LINES = 50
REQUIRE_AUTHORS = False
+
def tokenize(s, remove_whitespace=False):
s.replace('&apos;', "'")
@@ -40,6 +41,7 @@ def tokenize(s, remove_whitespace=False):
# Encode as dumb ASCII (TODO: this is horrible)
return s.encode('ascii', 'replace').replace(b'?', b'')
+
def check_authors(left, right):
"""
Intended to check GROBID extracted authors (right) against "known good"
@@ -63,6 +65,7 @@ def check_authors(left, right):
return False
return True
+
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
assert not check_authors([], ['one'])
@@ -74,6 +77,7 @@ def test_check_authors():
assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
# Rows are (score, left, right)
def process_group(rows):
@@ -119,6 +123,7 @@ def process_group(rows):
print(json.dumps([releases[ident] for ident in group_ids]))
+
def run():
last_slug = None
@@ -140,5 +145,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index abf81bd..3251852 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -33,6 +33,7 @@ def tokenize(s, remove_whitespace=False):
# Encode as dumb ASCII (TODO: this is horrible)
return s.encode('ascii', 'replace').replace(b'?', b'')
+
def check_authors(left, right):
"""
Intended to check GROBID extracted authors (right) against "known good"
@@ -56,6 +57,7 @@ def check_authors(left, right):
return False
return True
+
def test_check_authors():
assert not check_authors([], [])
assert not check_authors([], ['one'])
@@ -67,6 +69,7 @@ def test_check_authors():
assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+
# Rows are (score, grobid, crossref)
def process_group(rows):
if len(rows) > max_slug_lines:
@@ -92,6 +95,7 @@ def process_group(rows):
for sha1, doi_list in keepers.items():
print("{}\t{}".format(sha1, json.dumps(doi_list)))
+
def run():
last_slug = None
@@ -112,5 +116,6 @@ def run():
if lines:
process_group(lines)
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index d391f60..b42153c 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Takes old (HBase) or new (pg) style JSON wrappers of GROBID XML extraction
output, converts the XML to JSON, filters out raw affiliation strings, and
@@ -24,10 +23,12 @@ def parse_hbase(line):
tei_xml = obj['tei_xml']
return sha1hex, tei_xml
+
def parse_pg(line):
obj = json.loads(line)
return obj['sha1hex'], obj['tei_xml']
+
def run(mode='hbase'):
for line in sys.stdin:
if mode == 'hbase':
@@ -49,5 +50,6 @@ def run(mode='hbase'):
affiliations = [json.loads(a) for a in affiliations]
print('\t'.join([sha1hex, json.dumps(affiliations)]))
-if __name__=='__main__':
+
+if __name__ == '__main__':
run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index 8aee0be..c9bc134 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -4,7 +4,8 @@ import datetime
import json
import sys
-MAX_ABSTRACT_BYTES=4096
+MAX_ABSTRACT_BYTES = 4096
+
def parse_grobid_json(obj):
@@ -14,10 +15,7 @@ def parse_grobid_json(obj):
extra = dict()
if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(
- mimetype="text/plain",
- language=None,
- content=obj.get('abstract').strip())
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip())
abstracts = [abobj]
else:
abstracts = None
@@ -72,16 +70,16 @@ def parse_grobid_json(obj):
else:
extra = None
- return dict(
- title=obj['title'].strip(),
- contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
- abstracts=abstracts,
- release_type=release_type,
- release_date=release_date,
- extra=extra)
+ return dict(title=obj['title'].strip(),
+ contribs=contribs,
+ publisher=obj['journal'].get('publisher'),
+ volume=obj['journal'].get('volume'),
+ issue=obj['journal'].get('issue'),
+ abstracts=abstracts,
+ release_type=release_type,
+ release_date=release_date,
+ extra=extra)
+
def run():
for line in sys.stdin:
@@ -90,5 +88,6 @@ def run():
if out:
print(out)
-if __name__=="__main__":
+
+if __name__ == "__main__":
run()
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index acba2a8..70731d5 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
This script is used to turn ingest request postgres rows (in JSON export
format) back in to regular ingest request JSON.
@@ -25,6 +24,7 @@ def transform(row):
row['fatcat'] = dict(release_ident=extra['release_ident'])
return row
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -35,17 +35,18 @@ def run(args):
print(l, file=sys.stderr)
print(json.dumps(req, sort_keys=True))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 8267003..24e22fd 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -20,6 +20,7 @@ import sys
# 2. select all file metadata
# 3. output object
+
def or_none(s):
if s is None:
return None
@@ -27,6 +28,7 @@ def or_none(s):
return None
return s
+
def process_db(db_path):
db = sqlite3.connect(db_path)
@@ -52,5 +54,6 @@ def process_db(db_path):
dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1])
print(json.dumps(obj))
-if __name__=="__main__":
+
+if __name__ == "__main__":
process_db(sys.argv[1])
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 315b8d2..1f4a19f 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an OAI-PMH bulk dump (JSON) into ingest requests.
@@ -33,17 +32,19 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
+ 'info:eu-repo/semantics/draftVersion': 'draft',
'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
+ 'info:eu-repo/semantics/acceptedVersion': 'accepted',
'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ 'info:eu-repo/semantics/updatedVersion': 'updated',
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single OAI-PMH object to zero or more ingest requests.
@@ -112,6 +113,7 @@ def transform(obj):
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -122,17 +124,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index 71fbe54..3f81b3b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Quick CLI script to convert a PDF to thumbnail (.png, jpeg, etc).
@@ -23,12 +22,14 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "BGRA", 0, 1)
- img.thumbnail((180,300), Image.BICUBIC)
+ img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw',
+ "BGRA", 0, 1)
+ img.thumbnail((180, 300), Image.BICUBIC)
#img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
#img.save(outpath, quality=95)
+
if __name__ == '__main__':
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 590b429..b79f316 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform an unpaywall dump (JSON) into ingest requests.
"""
@@ -26,17 +25,19 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
+ 'draftVersion': 'draft',
'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
+ 'acceptedVersion': 'accepted',
'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ 'updatedVersion': 'updated',
}
+
def canon(s):
parsed = urlcanon.parse_url(s)
return str(urlcanon.whatwg(parsed))
+
def transform(obj):
"""
Transforms from a single unpaywall object to zero or more ingest requests.
@@ -86,6 +87,7 @@ def transform(obj):
return requests
+
def run(args):
for l in args.json_file:
if not l.strip():
@@ -96,17 +98,18 @@ def run(args):
for r in requests:
print("{}".format(json.dumps(r, sort_keys=True)))
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ help="unpaywall dump file to use",
+ type=argparse.FileType('r'))
subparsers = parser.add_subparsers()
args = parser.parse_args()
run(args)
+
if __name__ == '__main__':
main()