author     Bryan Newbold <bnewbold@archive.org>   2021-10-27 18:50:17 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2021-10-27 18:50:17 -0700
commit     826c7538e091fac14d987a3cd654975da964e240 (patch)
tree       90345b4cabb461c624ca5a218c2fc01dce3055cd /python/scripts
parent     020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
make fmt (black 21.9b0)
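
The reformatting below was produced by the repository's `make fmt` target with black pinned at 21.9b0. A minimal sketch of the equivalent invocation, assuming black is installed in the project's pipenv environment (the pipenv wrapper and exact paths are assumptions, not taken from the Makefile):

    # assumption: black 21.9b0 available via the project's pipenv environment
    pipenv run black python/scripts/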
Diffstat (limited to 'python/scripts')
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py      57
-rwxr-xr-x  python/scripts/archiveorg_fileset.py           99
-rwxr-xr-x  python/scripts/cdx_collection.py               26
-rwxr-xr-x  python/scripts/covid2ingestrequest.py          66
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py     67
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py         154
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py           148
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py           64
-rwxr-xr-x  python/scripts/enrich_scored_matches.py        20
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py       99
-rwxr-xr-x  python/scripts/filter_groupworks.py            40
-rwxr-xr-x  python/scripts/filter_scored_matches.py        42
-rwxr-xr-x  python/scripts/grobid_affiliations.py          24
-rwxr-xr-x  python/scripts/import_grobid_metadata.py       64
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py       18
-rwxr-xr-x  python/scripts/oai2ingestrequest.py            64
-rwxr-xr-x  python/scripts/pdf_thumbnail.py                11
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py      63
18 files changed, 601 insertions, 525 deletions
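
One detail worth noting when reading the hunks below: the reformatted lines wrap at roughly 96 characters rather than black's default of 88, which implies a project-level line-length override. A hypothetical pyproject.toml stanza that would reproduce this wrapping (the value is inferred from the diff itself and the file location is an assumption):

    [tool.black]
    # 96 is inferred from where the reformatted argument lists wrap; an assumption
    line-length = 96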
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 9cc9055..4561541 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -21,43 +21,48 @@ def run(args):
if not l.strip():
continue
row = json.loads(l)
- if not row['hit']:
+ if not row["hit"]:
continue
request = {
- 'base_url': row['final_url'],
- 'ingest_type': args.ingest_type,
- 'link_source': args.link_source,
- 'link_source_id': row['identifier'],
- 'ingest_request_source': args.ingest_request_source,
- 'ext_ids': {
- args.extid_type: row['identifier'],
+ "base_url": row["final_url"],
+ "ingest_type": args.ingest_type,
+ "link_source": args.link_source,
+ "link_source_id": row["identifier"],
+ "ingest_request_source": args.ingest_request_source,
+ "ext_ids": {
+ args.extid_type: row["identifier"],
},
}
if args.release_stage:
- assert args.release_stage in ('published', 'submitted', 'accepted', 'draft',
- 'update')
- request['release_stage'] = args.release_stage
+ assert args.release_stage in (
+ "published",
+ "submitted",
+ "accepted",
+ "draft",
+ "update",
+ )
+ request["release_stage"] = args.release_stage
print("{}".format(json.dumps(request, sort_keys=True)))
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--link-source',
- required=True,
- help="link_source to include in request")
- parser.add_argument('--extid-type', required=True, help="extid to encode identifier as")
- parser.add_argument('--ingest-type',
- default="pdf",
- help="ingest type (pdf, html, xml, etc)")
- parser.add_argument('--ingest-request-source',
- default="arabesque",
- help="to include in request")
- parser.add_argument('--release-stage', default=None, help="to include in request")
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--link-source", required=True, help="link_source to include in request"
+ )
+ parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
+ parser.add_argument(
+ "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
+ )
+ parser.add_argument(
+ "--ingest-request-source", default="arabesque", help="to include in request"
+ )
+ parser.add_argument("--release-stage", default=None, help="to include in request")
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -65,5 +70,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 83c04e3..6328f52 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -16,32 +16,31 @@ from typing import Any
import internetarchive
FORMAT_TO_MIMETYPE = {
- 'BZIP': 'application/x-bzip',
- 'BZIP2': 'application/x-bzip2',
- 'ZIP': 'application/zip',
- 'GZIP': 'application/gzip',
- 'RAR': 'application/vnd.rar',
- 'TAR': 'application/x-tar',
- '7z': 'application/x-7z-compressed',
- 'HTML': 'text/html',
- 'Text': 'text/plain',
- 'PDF': 'application/pdf',
- 'CSV': 'text/csv',
- 'XML': 'application/xml',
- 'JSON': 'application/json',
-
+ "BZIP": "application/x-bzip",
+ "BZIP2": "application/x-bzip2",
+ "ZIP": "application/zip",
+ "GZIP": "application/gzip",
+ "RAR": "application/vnd.rar",
+ "TAR": "application/x-tar",
+ "7z": "application/x-7z-compressed",
+ "HTML": "text/html",
+ "Text": "text/plain",
+ "PDF": "application/pdf",
+ "CSV": "text/csv",
+ "XML": "application/xml",
+ "JSON": "application/json",
#'application/msword (.doc)', # .doc
#'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx
#'application/vnd.ms-excel', # .xls
#'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx
- 'MP3': 'audio/mpeg', # .mp3
- 'MP4': 'video/mp4', # .mp4
- 'MPEG': 'video/mpeg', # .mpeg
- 'JPEG': 'image/jpeg',
- 'GIF': 'image/gif',
- 'PNG': 'image/png',
- 'TIFF': 'image/tiff',
- 'Unknown': None,
+ "MP3": "audio/mpeg", # .mp3
+ "MP4": "video/mp4", # .mp4
+ "MPEG": "video/mpeg", # .mpeg
+ "JPEG": "image/jpeg",
+ "GIF": "image/gif",
+ "PNG": "image/png",
+ "TIFF": "image/tiff",
+ "Unknown": None,
}
@@ -49,22 +48,22 @@ def want_file(f: dict, item_name: str) -> bool:
"""
Filters IA API files
"""
- if f.source != 'original':
+ if f.source != "original":
return False
for suffix in [
- '_meta.sqlite',
- '_archive.torrent',
- '_itemimage.jpg',
- '_meta.xml',
- '_thumb.png',
- '_files.xml',
+ "_meta.sqlite",
+ "_archive.torrent",
+ "_itemimage.jpg",
+ "_meta.xml",
+ "_thumb.png",
+ "_files.xml",
]:
if f.name == item_name + suffix or f.name == item_name.lower() + suffix:
return False
- if f.name.startswith('_'):
+ if f.name.startswith("_"):
return False
- if item_name.startswith('academictorrents_'):
- for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']:
+ if item_name.startswith("academictorrents_"):
+ for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]:
if f.name == item_name + suffix:
return False
return True
@@ -77,36 +76,38 @@ def parse_file(f: dict) -> dict:
assert f.name and f.sha1 and f.md5
assert f.name is not None
mf = {
- 'path': f.name,
- 'size': int(f.size),
- 'sha1': f.sha1,
- 'md5': f.md5,
+ "path": f.name,
+ "size": int(f.size),
+ "sha1": f.sha1,
+ "md5": f.md5,
}
# TODO: will disable this hard check eventually and replace with:
- #mimetype = FORMAT_TO_MIMETYPE.get(f.format)
+ # mimetype = FORMAT_TO_MIMETYPE.get(f.format)
mimetype = FORMAT_TO_MIMETYPE[f.format]
if mimetype:
- mf['extra'] = dict(mimetype=mimetype)
+ mf["extra"] = dict(mimetype=mimetype)
return mf
def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession):
print(f"processing item={item_name} release_id={release_id}", file=sys.stderr)
- if release_id.startswith('release_'):
+ if release_id.startswith("release_"):
release_id = release_id[9:]
assert len(release_id) == 26
item = session.get_item(item_name)
- assert item.metadata['mediatype'] not in ['collection', 'web']
+ assert item.metadata["mediatype"] not in ["collection", "web"]
item_files = item.get_files(on_the_fly=False)
manifest = [parse_file(f) for f in item_files if want_file(f, item_name)]
fileset = {
- 'manifest': manifest,
- 'urls': [{
- 'rel': 'archive',
- 'url': f'https://archive.org/download/{item_name}/',
- }, ],
- 'release_ids': [release_id],
- #extra={},
+ "manifest": manifest,
+ "urls": [
+ {
+ "rel": "archive",
+ "url": f"https://archive.org/download/{item_name}/",
+ },
+ ],
+ "release_ids": [release_id],
+ # extra={},
}
print(json.dumps(fileset))
return fileset
@@ -123,12 +124,12 @@ def main():
line = line.strip()
if not line:
continue
- fields = line.split('\t')
+ fields = line.split("\t")
assert len(fields) == 2
item_name = fields[0]
release_id = fields[1]
item_to_fileset(item_name, release_id=release_id, session=session)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index aa78aec..0b60da3 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -29,7 +29,7 @@ def run():
collection = sys.argv[1]
# Check collection name is clean
- assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum()
+ assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum()
tempdir = tempfile.mkdtemp()
print("Looking up collection: {}".format(collection))
@@ -45,15 +45,17 @@ def run():
status = True
errors = []
for item in item_list:
- item = item['identifier']
+ item = item["identifier"]
# TODO: error handling
try:
- ret = ia.download(item,
- files=[item + '.cdx.gz'],
- verbose=True,
- destdir=tempdir,
- no_directory=True,
- retries=1000)
+ ret = ia.download(
+ item,
+ files=[item + ".cdx.gz"],
+ verbose=True,
+ destdir=tempdir,
+ no_directory=True,
+ retries=1000,
+ )
status = ret and status
except requests.exceptions.ReadTimeout as rt:
print(str(rt), file=sys.stderr)
@@ -67,14 +69,14 @@ def run():
# Combine files
print("Merging and re-compressing all CDX files...")
- #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
- subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True)
+ # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir),
+ subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True)
# Move and cleanup
- shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection))
+ shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection))
print("Done!")
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 4714b60..e3bf4f0 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -18,38 +18,44 @@ def canon(s):
def transform_cnki(obj):
requests = []
- assert obj['cnki_id']
+ assert obj["cnki_id"]
requests = []
- requests.append({
- 'base_url': canon(obj['info_url']),
- 'ingest_type': 'pdf',
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
- if 'read_url' in obj:
- requests.append({
- 'base_url': canon(obj['read_url']),
- 'ingest_type': 'pdf', # actually HTML
- 'link_source': 'cnki_covid19',
- 'link_source_id': obj['cnki_id'],
- 'ingest_request_source': 'scrape-covid19',
- })
+ requests.append(
+ {
+ "base_url": canon(obj["info_url"]),
+ "ingest_type": "pdf",
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
+ if "read_url" in obj:
+ requests.append(
+ {
+ "base_url": canon(obj["read_url"]),
+ "ingest_type": "pdf", # actually HTML
+ "link_source": "cnki_covid19",
+ "link_source_id": obj["cnki_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ )
return requests
def transform_wanfang(obj):
- assert obj['wanfang_id']
- return [{
- 'base_url': canon(obj['url']),
- 'ingest_type': 'pdf',
- 'link_source': 'wanfang_covid19',
- 'link_source_id': obj['wanfang_id'],
- 'ingest_request_source': 'scrape-covid19',
- }]
+ assert obj["wanfang_id"]
+ return [
+ {
+ "base_url": canon(obj["url"]),
+ "ingest_type": "pdf",
+ "link_source": "wanfang_covid19",
+ "link_source_id": obj["wanfang_id"],
+ "ingest_request_source": "scrape-covid19",
+ }
+ ]
def run(args):
@@ -58,9 +64,9 @@ def run(args):
continue
row = json.loads(l)
- if 'wanfang_id' in row:
+ if "wanfang_id" in row:
requests = transform_wanfang(row) or []
- elif 'cnki_id' in row:
+ elif "cnki_id" in row:
requests = transform_cnki(row) or []
else:
continue
@@ -70,9 +76,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="COVID-19 metadata file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -80,5 +86,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 3b53235..3c769cf 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -45,38 +45,38 @@ def b32_hex(s):
s = s[5:]
if len(s) != 32:
return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
-class DeliverDumpGrobidS3():
+class DeliverDumpGrobidS3:
def __init__(self, s3_bucket, **kwargs):
self.rstore = None
self.count = Counter()
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'grobid/')
- self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml')
- self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "grobid/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml")
+ self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def run(self, dump_file):
sys.stderr.write("Starting...\n")
for line in dump_file:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, grobid_json = line[0], line[1]
if len(sha1_hex) != 40:
sha1_hex = b32_hex(sha1_hex)
assert len(sha1_hex) == 40
grobid = json.loads(grobid_json)
- tei_xml = grobid.get('tei_xml')
+ tei_xml = grobid.get("tei_xml")
if not tei_xml:
print("{}\tskip empty".format(sha1_hex))
- self.count['skip-empty'] += 1
+ self.count["skip-empty"] += 1
continue
- tei_xml = tei_xml.encode('utf-8')
+ tei_xml = tei_xml.encode("utf-8")
# upload to AWS S3
obj = self.bucket.put_object(
Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
@@ -84,7 +84,7 @@ class DeliverDumpGrobidS3():
StorageClass=self.s3_storage_class,
)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
@@ -92,31 +92,32 @@ class DeliverDumpGrobidS3():
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="grobid/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".tei.xml",
- help='file suffix for created objects')
- parser.add_argument('--s3-storage-class',
- type=str,
- default="STANDARD",
- help='AWS S3 storage class (redundancy) to use')
- parser.add_argument('dump_file',
- help="TSV/JSON dump file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix",
+ type=str,
+ default="grobid/",
+ help="key prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--s3-storage-class",
+ type=str,
+ default="STANDARD",
+ help="AWS S3 storage class (redundancy) to use",
+ )
+ parser.add_argument(
+ "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r")
+ )
args = parser.parse_args()
worker = DeliverDumpGrobidS3(**args.__dict__)
worker.run(args.dump_file)
-if __name__ == '__main__': # pragma: no cover
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index ca19b97..fcaf51f 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -27,64 +27,76 @@ sentry_client = raven.Client()
class DeliverGwbDisk:
def __init__(self, disk_dir, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.disk_dir = disk_dir
- self.disk_prefix = kwargs.get('disk_prefix', 'pdf/')
- self.disk_suffix = kwargs.get('disk_suffix', '.pdf')
+ self.disk_prefix = kwargs.get("disk_prefix", "pdf/")
+ self.disk_suffix = kwargs.get("disk_suffix", ".pdf")
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(
- loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
)
except ValueError as ve:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".
- format(ve))
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".
- format(eofe))
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
return None, dict(
status="error",
- reason=
- "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
- .format(te))
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ return None, dict(
+ status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
return None, dict(
status="error",
- reason=
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
- format(ire))
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
@@ -95,47 +107,57 @@ class DeliverGwbDisk:
os.makedirs(fpath, exist_ok=True)
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
- file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
- status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# save to disk
- fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2],
- sha1_hex[2:4], sha1_hex, self.disk_suffix)
- with open(fpath, 'wb') as f:
+ fpath = "{}/{}{}/{}/{}{}".format(
+ self.disk_dir,
+ self.disk_prefix,
+ sha1_hex[0:2],
+ sha1_hex[2:4],
+ sha1_hex,
+ self.disk_suffix,
+ )
+ with open(fpath, "wb") as f:
f.write(blob)
print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob)))
- self.count['success-disk'] += 1
+ self.count["success-disk"] += 1
sys.stderr.write("{}\n".format(self.count))
@@ -143,31 +165,35 @@ class DeliverGwbDisk:
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--disk-dir',
- required=True,
- type=str,
- help='local base directory to save into')
- parser.add_argument('--disk-prefix',
- type=str,
- default="pdf/",
- help='directory prefix for items created in bucket')
- parser.add_argument('--disk-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created files')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--disk-dir", required=True, type=str, help="local base directory to save into"
+ )
+ parser.add_argument(
+ "--disk-prefix",
+ type=str,
+ default="pdf/",
+ help="directory prefix for items created in bucket",
+ )
+ parser.add_argument(
+ "--disk-suffix", type=str, default=".pdf", help="file suffix for created files"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
worker = DeliverGwbDisk(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index f9b3b19..1f08c4f 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -54,111 +54,128 @@ sentry_client = raven.Client()
class DeliverGwbS3:
def __init__(self, s3_bucket, **kwargs):
- self.warc_uri_prefix = kwargs.get('warc_uri_prefix')
+ self.warc_uri_prefix = kwargs.get("warc_uri_prefix")
self.rstore = None
self.count = Counter()
# /serve/ instead of /download/ doesn't record view count
- self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/')
+ self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/")
# gwb library will fall back to reading from /opt/.petabox/webdata.secret
- self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret',
- os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.petabox_webdata_secret = kwargs.get(
+ "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET")
+ )
self.s3_bucket = s3_bucket
- self.s3_prefix = kwargs.get('s3_prefix', 'pdf/')
- self.s3_suffix = kwargs.get('s3_suffix', '.pdf')
- self.s3 = boto3.resource('s3')
+ self.s3_prefix = kwargs.get("s3_prefix", "pdf/")
+ self.s3_suffix = kwargs.get("s3_suffix", ".pdf")
+ self.s3 = boto3.resource("s3")
self.bucket = self.s3.Bucket(self.s3_bucket)
def fetch_warc_content(self, warc_path, offset, c_size):
warc_uri = self.warc_uri_prefix + warc_path
if not self.rstore:
self.rstore = ResourceStore(
- loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret,
- download_base_url=self.petabox_base_url))
+ loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.petabox_base_url,
+ )
+ )
try:
gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ResourceUnavailable)"
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)",
)
except ValueError as ve:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (ValueError: {})".
- format(ve))
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(
+ ve
+ ),
+ )
except EOFError as eofe:
return None, dict(
status="error",
- reason="failed to load file contents from wayback/petabox (EOFError: {})".
- format(eofe))
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(
+ eofe
+ ),
+ )
except TypeError as te:
return None, dict(
status="error",
- reason=
- "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)"
- .format(te))
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(
+ te
+ ),
+ )
# Note: could consider a generic "except Exception" here, as we get so
# many petabox errors. Do want jobs to fail loud and clear when the
# whole cluster is down though.
if gwb_record.get_status()[0] != 200:
- return None, dict(status="error",
- reason="archived HTTP response (WARC) was not 200",
- warc_status=gwb_record.get_status()[0])
+ return None, dict(
+ status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0],
+ )
try:
raw_content = gwb_record.open_raw_content().read()
except IncompleteRead as ire:
return None, dict(
status="error",
- reason=
- "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".
- format(ire))
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(
+ ire
+ ),
+ )
return raw_content, None
def run(self, manifest_file):
sys.stderr.write("Starting...\n")
for line in manifest_file:
- self.count['total'] += 1
- line = line.strip().split('\t')
+ self.count["total"] += 1
+ line = line.strip().split("\t")
if len(line) != 2:
- self.count['skip-line'] += 1
+ self.count["skip-line"] += 1
continue
sha1_hex, cdx_json = line[0], line[1]
assert len(sha1_hex) == 40
file_cdx = json.loads(cdx_json)
# If warc is not item/file.(w)arc.gz form, skip it
- if len(file_cdx['warc'].split('/')) != 2:
- sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc']))
- print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc']))
- self.count['skip-warc'] += 1
+ if len(file_cdx["warc"].split("/")) != 2:
+ sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"]))
+ print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"]))
+ self.count["skip-warc"] += 1
continue
# fetch from GWB/petabox via HTTP range-request
- blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'],
- file_cdx['c_size'])
+ blob, status = self.fetch_warc_content(
+ file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"]
+ )
if blob is None and status:
- print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'],
- status['reason']))
- self.count['err-petabox-fetch'] += 1
+ print(
+ "{}\terror petabox\t{}\t{}".format(
+ sha1_hex, file_cdx["warc"], status["reason"]
+ )
+ )
+ self.count["err-petabox-fetch"] += 1
continue
elif not blob:
print("{}\tskip-empty-blob".format(sha1_hex))
- self.count['skip-empty-blob'] += 1
+ self.count["skip-empty-blob"] += 1
continue
# verify sha1
if sha1_hex != hashlib.sha1(blob).hexdigest():
- #assert sha1_hex == hashlib.sha1(blob).hexdigest()
- #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
+ # assert sha1_hex == hashlib.sha1(blob).hexdigest()
+ # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex))
print("{}\terror petabox-hash-mismatch".format(sha1_hex))
- self.count['err-petabox-hash-mismatch'] += 1
+ self.count["err-petabox-hash-mismatch"] += 1
- self.count['petabox-ok'] += 1
+ self.count["petabox-ok"] += 1
# upload to AWS S3
- obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4],
- sha1_hex, self.s3_suffix),
- Body=blob)
+ obj = self.bucket.put_object(
+ Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix),
+ Body=blob,
+ )
print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob)))
- self.count['success-s3'] += 1
+ self.count["success-s3"] += 1
sys.stderr.write("{}\n".format(self.count))
@@ -166,31 +183,32 @@ class DeliverGwbS3:
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('--s3-bucket',
- required=True,
- type=str,
- help='AWS S3 bucket to upload into')
- parser.add_argument('--s3-prefix',
- type=str,
- default="pdf/",
- help='key prefix for items created in bucket')
- parser.add_argument('--s3-suffix',
- type=str,
- default=".pdf",
- help='file suffix for created objects')
- parser.add_argument('--warc-uri-prefix',
- type=str,
- default='https://archive.org/serve/',
- help='URI where WARCs can be found')
- parser.add_argument('manifest_file',
- help="TSV/JSON manifest file",
- default=sys.stdin,
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into"
+ )
+ parser.add_argument(
+ "--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket"
+ )
+ parser.add_argument(
+ "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects"
+ )
+ parser.add_argument(
+ "--warc-uri-prefix",
+ type=str,
+ default="https://archive.org/serve/",
+ help="URI where WARCs can be found",
+ )
+ parser.add_argument(
+ "manifest_file",
+ help="TSV/JSON manifest file",
+ default=sys.stdin,
+ type=argparse.FileType("r"),
+ )
args = parser.parse_args()
worker = DeliverGwbS3(**args.__dict__)
worker.run(args.manifest_file)
-if __name__ == '__main__': # pragma: no cover
+if __name__ == "__main__": # pragma: no cover
main()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index 84a2c2c..67286b9 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -17,23 +17,21 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
"ncbi.nlm.nih.gov/",
- #"semanticscholar.org/",
+ # "semanticscholar.org/",
"://doi.org/",
"zenodo.org/",
"figshare.com/",
"://archive.org/",
".archive.org/",
-
# large publishers/platforms; may remove in the future
- #"://link.springer.com/",
- #"://dergipark.gov.tr/",
- #"frontiersin.org/",
- #"scielo",
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
]
# these default to PDF; note that we also do pdf ingests for HTML pages
@@ -63,35 +61,35 @@ def transform(obj: dict) -> List[dict]:
Returns a list of dicts.
"""
- doaj_id = obj['id'].lower()
+ doaj_id = obj["id"].lower()
assert doaj_id
- bibjson = obj['bibjson']
- if not bibjson['link']:
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
return []
requests = []
doi: Optional[str] = None
- for ident in (bibjson['identifier'] or []):
- if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
- doi = ident['id'].lower()
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
- for link in (bibjson['link'] or []):
- if link.get('type') != "fulltext" or not link.get('url'):
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
continue
- ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
if not ingest_types:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in link['url'].lower():
+ if domain in link["url"].lower():
skip = True
if skip:
continue
try:
- base_url = canon(link['url'].strip())
+ base_url = canon(link["url"].strip())
except UnicodeEncodeError:
continue
@@ -100,18 +98,18 @@ def transform(obj: dict) -> List[dict]:
for ingest_type in ingest_types:
request = {
- 'base_url': base_url,
- 'ingest_type': ingest_type,
- 'link_source': 'doaj',
- 'link_source_id': doaj_id,
- 'ingest_request_source': 'doaj',
- 'release_stage': 'published',
- 'rel': 'publisher',
- 'ext_ids': {
- 'doi': doi,
- 'doaj': doaj_id,
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
},
- 'edit_extra': {},
+ "edit_extra": {},
}
requests.append(request)
@@ -131,9 +129,9 @@ def run(args) -> None:
def main() -> None:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="DOAJ article dump file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -141,5 +139,5 @@ def main() -> None:
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 54c3d5f..44c091c 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -24,23 +24,25 @@ import sys
def run():
for line in sys.stdin:
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
+ raw_sha1 = line[0].replace("sha1:", "")
dois = json.loads(line[1])
cdx = json.loads(line[2])
mimetype = line[3]
size = int(line[4])
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+ sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower()
- obj = dict(sha1=sha1,
- dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
- size=size,
- mimetype=mimetype)
+ obj = dict(
+ sha1=sha1,
+ dois=dois,
+ cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
+ size=size,
+ mimetype=mimetype,
+ )
print(json.dumps(obj))
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index a474393..8fce0d9 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -3,39 +3,41 @@
import json
import sys
-with open('title_slug_denylist.txt', 'r') as f:
+with open("title_slug_denylist.txt", "r") as f:
TITLE_DENYLIST = [l.strip() for l in f]
-TITLE_DENYLIST.extend((
- 'editorial',
- 'advertisement',
- 'bookreviews',
- 'reviews',
- 'nr',
- 'abstractoriginalarticle',
- 'originalarticle',
- 'impactfactor',
- 'articlenumber',
-))
+TITLE_DENYLIST.extend(
+ (
+ "editorial",
+ "advertisement",
+ "bookreviews",
+ "reviews",
+ "nr",
+ "abstractoriginalarticle",
+ "originalarticle",
+ "impactfactor",
+ "articlenumber",
+ )
+)
# The full name can't *entirely* be one of these
NAME_DENYLIST = (
- 'phd',
- 'phdstudent',
+ "phd",
+ "phdstudent",
)
def tokenize(s, remove_whitespace=True):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
+ return s.encode("ascii", "replace").decode("utf8").replace("?", "")
assert tokenize("Impact Factor: 2.114") == "impactfactor"
@@ -50,14 +52,14 @@ def filter_title(title):
title_slug = tokenize(title, remove_whitespace=True)
if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
- if title_slug.startswith('nr'):
+ if title_slug.startswith("nr"):
return None
- if title.lower().replace('.', '').startswith('int j '):
+ if title.lower().replace(".", "").startswith("int j "):
return None
for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
if title.startswith(prefix):
- title.replace(prefix, '')
+ title.replace(prefix, "")
if title.startswith("The Journal of "):
return None
@@ -81,17 +83,17 @@ def filter_title(title):
return None
# too deep subtitling/splitting
- if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
+ if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1:
return None
return title
def filter_author_name(name):
- name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
+ name = name["name"]
+ if name.strip().lower().replace(" ", "") in NAME_DENYLIST:
return None
- return ' '.join([t for t in name.split() if tokenize(t)])
+ return " ".join([t for t in name.split() if tokenize(t)])
def filter_authors(l):
@@ -107,45 +109,58 @@ def filter_journal_name(name):
# same denylist, for now
if not name:
return None
- name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
+ name = name.replace(" e-ISSN", "").replace(" p-ISSN", "")
slug_name = tokenize(name)
if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
- for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ",
- "Research Article ", "Available online www.jocpr.com "):
+ for prefix in (
+ "/ ",
+ "~ ",
+ "& ",
+ "© ",
+ "Original Research Article ",
+ "Original Article ",
+ "Research Article ",
+ "Available online www.jocpr.com ",
+ ):
if name.startswith(prefix):
- name = name.replace(prefix, '')
- for suffix in (" Available online at www.sciarena.com", " Original Article",
- " Available online at", " ISSN", " ISSUE"):
+ name = name.replace(prefix, "")
+ for suffix in (
+ " Available online at www.sciarena.com",
+ " Original Article",
+ " Available online at",
+ " ISSN",
+ " ISSUE",
+ ):
if name.endswith(suffix):
- name = name.replace(suffix, '')
+ name = name.replace(suffix, "")
if "====================" in name:
return None
if len(name) > 150:
return None
- return ' '.join(name.split())
+ return " ".join(name.split())
def filter_metadata(obj):
- if not (obj.get('title') and obj.get('authors')):
+ if not (obj.get("title") and obj.get("authors")):
return None
- title = filter_title(obj['title'])
+ title = filter_title(obj["title"])
if not title:
- #sys.stderr.write("bad title\n")
+ # sys.stderr.write("bad title\n")
return None
else:
- obj['title'] = title
- obj['authors'] = filter_authors(obj['authors'])
- obj['citations'] = filter_refs(obj['citations'])
- obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
+ obj["title"] = title
+ obj["authors"] = filter_authors(obj["authors"])
+ obj["citations"] = filter_refs(obj["citations"])
+ obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"])
return obj
def run(invert=False):
for line in sys.stdin:
- fields = line.split('\t')
+ fields = line.split("\t")
if len(fields) == 5:
raw = fields[4]
elif len(fields) == 1:
@@ -162,7 +177,7 @@ def run(invert=False):
fields[4] = processed
else:
fields[0] = processed
- print('\t'.join(fields))
+ print("\t".join(fields))
elif invert:
print(raw.strip())
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index fda9098..87dae16 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -31,15 +31,15 @@ REQUIRE_AUTHORS = False
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
def check_authors(left, right):
@@ -53,7 +53,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -61,21 +61,21 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
def test_check_authors():
assert check_authors([], []) == bool(not REQUIRE_AUTHORS)
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
# Rows are (score, left, right)
@@ -90,10 +90,10 @@ def process_group(rows):
left = json.loads(row[1])
right = json.loads(row[2])
# authors must roughly match
- if not check_authors(left['authors'], right['authors']):
+ if not check_authors(left["authors"], right["authors"]):
continue
# years must match (if defined)
- if left['year'] and right['year'] and left['year'] != right['year']:
+ if left["year"] and right["year"] and left["year"] != right["year"]:
continue
filtered.append((left, right))
@@ -105,8 +105,8 @@ def process_group(rows):
group_ids = set()
for row in filtered[1:]:
(left, right) = row
- l_id = left['fatcat_release']
- r_id = right['fatcat_release']
+ l_id = left["fatcat_release"]
+ r_id = right["fatcat_release"]
releases[l_id] = left
releases[r_id] = right
if not group_ids:
@@ -131,7 +131,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -146,5 +146,5 @@ def run():
process_group(lines)
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3251852..c5b7eef 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -23,15 +23,15 @@ require_authors = 1
def tokenize(s, remove_whitespace=False):
- s.replace('&apos;', "'")
+ s.replace("&apos;", "'")
# Remove non-alphanumeric characters
- s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
+ s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()])
if remove_whitespace:
- s = ''.join(s.split())
+ s = "".join(s.split())
# Encode as dumb ASCII (TODO: this is horrible)
- return s.encode('ascii', 'replace').replace(b'?', b'')
+ return s.encode("ascii", "replace").replace(b"?", b"")
def check_authors(left, right):
@@ -45,7 +45,7 @@ def check_authors(left, right):
return False
right_all = tokenize(" ".join(right))
for i in range(len(left)):
- l = left[i].lower().replace('jr.', '').split()
+ l = left[i].lower().replace("jr.", "").split()
if not l:
return False
l = tokenize(l[-1])
@@ -53,21 +53,21 @@ def check_authors(left, right):
# weird author name (single char)
return False
if l not in right_all:
- #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
+ # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
return False
return True
def test_check_authors():
assert not check_authors([], [])
- assert not check_authors([], ['one'])
- assert check_authors(['one'], ['one'])
- assert check_authors(['one two'], ['One Two'])
- assert check_authors(['two'], ['One Two'])
- assert check_authors(['two'], ['two, one'])
- assert check_authors(['mago'], ['Mr. Magoo'])
- assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
- assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+ assert not check_authors([], ["one"])
+ assert check_authors(["one"], ["one"])
+ assert check_authors(["one two"], ["One Two"])
+ assert check_authors(["two"], ["One Two"])
+ assert check_authors(["two"], ["two, one"])
+ assert check_authors(["mago"], ["Mr. Magoo"])
+ assert check_authors(["Mr. Magoo"], ["Mr Magoo"])
+ assert check_authors(["one", "tw", "thr"], ["one", "two", "three"])
# Rows are (score, grobid, crossref)
@@ -81,14 +81,14 @@ def process_group(rows):
continue
grobid = json.loads(row[1])
crossref = json.loads(row[2])
- if not check_authors(crossref['authors'], grobid['authors']):
- #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
+ if not check_authors(crossref["authors"], grobid["authors"]):
+ # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
continue
else:
- #print("YES: {} {}".format(crossref['authors'], grobid['authors']))
+ # print("YES: {} {}".format(crossref['authors'], grobid['authors']))
pass
- sha1 = grobid['sha1']
- doi = crossref['doi'].lower()
+ sha1 = grobid["sha1"]
+ doi = crossref["doi"].lower()
l = keepers.get(sha1, list())
l.append(doi)
keepers[sha1] = l
@@ -103,7 +103,7 @@ def run():
# group lines by slug, and process in batches
for line in sys.stdin:
- line = line.strip().split('\t')
+ line = line.strip().split("\t")
assert len(line) == 4
slug = line[0]
if last_slug and slug != last_slug and lines:
@@ -117,5 +117,5 @@ def run():
process_group(lines)
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index b42153c..b01e46a 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -16,40 +16,40 @@ from grobid2json import teixml2json
def parse_hbase(line):
- line = line.split('\t')
+ line = line.split("\t")
assert len(line) == 2
sha1hex = line[0]
obj = json.loads(line[1])
- tei_xml = obj['tei_xml']
+ tei_xml = obj["tei_xml"]
return sha1hex, tei_xml
def parse_pg(line):
obj = json.loads(line)
- return obj['sha1hex'], obj['tei_xml']
+ return obj["sha1hex"], obj["tei_xml"]
-def run(mode='hbase'):
+def run(mode="hbase"):
for line in sys.stdin:
- if mode == 'hbase':
+ if mode == "hbase":
sha1hex, tei_xml = parse_hbase(line)
- elif mode == 'pg':
+ elif mode == "pg":
sha1hex, tei_xml = parse_pg(line)
else:
- raise NotImplementedError('parse mode: {}'.format(mode))
+ raise NotImplementedError("parse mode: {}".format(mode))
obj = teixml2json(tei_xml, encumbered=False)
affiliations = []
- for author in obj['authors']:
- if author.get('affiliation'):
- affiliations.append(author['affiliation'])
+ for author in obj["authors"]:
+ if author.get("affiliation"):
+ affiliations.append(author["affiliation"])
if affiliations:
# don't duplicate affiliations; only the unique ones
affiliations = list(set([json.dumps(a) for a in affiliations]))
affiliations = [json.loads(a) for a in affiliations]
- print('\t'.join([sha1hex, json.dumps(affiliations)]))
+ print("\t".join([sha1hex, json.dumps(affiliations)]))
-if __name__ == '__main__':
+if __name__ == "__main__":
run()
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index c9bc134..f941881 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -9,59 +9,59 @@ MAX_ABSTRACT_BYTES = 4096
def parse_grobid_json(obj):
- if not obj.get('title'):
+ if not obj.get("title"):
return None
extra = dict()
- if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES:
- abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip())
+ if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES:
+ abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip())
abstracts = [abobj]
else:
abstracts = None
contribs = []
- for a in obj.get('authors', []):
+ for a in obj.get("authors", []):
c = dict(raw_name=a, role="author")
contribs.append(c)
refs = []
- for raw in obj.get('citations', []):
+ for raw in obj.get("citations", []):
extra = dict()
ref = dict()
- ref['key'] = raw.get('id')
- if raw.get('title'):
- ref['title'] = raw['title'].strip()
- if raw.get('date'):
+ ref["key"] = raw.get("id")
+ if raw.get("title"):
+ ref["title"] = raw["title"].strip()
+ if raw.get("date"):
try:
- year = int(raw['date'].strip()[:4])
- ref['year'] = year
+ year = int(raw["date"].strip()[:4])
+ ref["year"] = year
except:
pass
- for key in ('volume', 'url', 'issue', 'publisher'):
+ for key in ("volume", "url", "issue", "publisher"):
if raw.get(key):
extra[key] = raw[key].strip()
- if raw.get('authors'):
- extra['authors'] = [a['name'] for a in raw['authors']]
+ if raw.get("authors"):
+ extra["authors"] = [a["name"] for a in raw["authors"]]
if extra:
extra = dict(grobid=extra)
else:
extra = None
- ref['extra'] = extra
+ ref["extra"] = extra
refs.append(ref)
release_type = "journal-article"
release_date = None
- if obj.get('date'):
+ if obj.get("date"):
# TODO: only returns year, ever? how to handle?
- release_date = datetime.datetime(year=obj['date'], month=1, day=1)
+ release_date = datetime.datetime(year=obj["date"], month=1, day=1)
- if obj.get('doi'):
- extra['doi'] = obj['doi'].lower()
- if obj['journal'].get('name'):
- extra['container_name'] = obj['journal']['name']
+ if obj.get("doi"):
+ extra["doi"] = obj["doi"].lower()
+ if obj["journal"].get("name"):
+ extra["container_name"] = obj["journal"]["name"]
- extra['is_longtail_oa'] = True
+ extra["is_longtail_oa"] = True
# TODO: ISSN/eISSN handling? or just journal name lookup?
@@ -70,15 +70,17 @@ def parse_grobid_json(obj):
else:
extra = None
- return dict(title=obj['title'].strip(),
- contribs=contribs,
- publisher=obj['journal'].get('publisher'),
- volume=obj['journal'].get('volume'),
- issue=obj['journal'].get('issue'),
- abstracts=abstracts,
- release_type=release_type,
- release_date=release_date,
- extra=extra)
+ return dict(
+ title=obj["title"].strip(),
+ contribs=contribs,
+ publisher=obj["journal"].get("publisher"),
+ volume=obj["journal"].get("volume"),
+ issue=obj["journal"].get("issue"),
+ abstracts=abstracts,
+ release_type=release_type,
+ release_date=release_date,
+ extra=extra,
+ )
def run():
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 70731d5..d52e793 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -15,13 +15,13 @@ def transform(row):
"""
dict-to-dict
"""
- row.pop('created', None)
- extra = row.pop('request', None) or {}
- for k in ('ext_ids', 'edit_extra'):
+ row.pop("created", None)
+ extra = row.pop("request", None) or {}
+ for k in ("ext_ids", "edit_extra"):
if k in extra:
row[k] = extra[k]
- if 'release_ident' in extra:
- row['fatcat'] = dict(release_ident=extra['release_ident'])
+ if "release_ident" in extra:
+ row["fatcat"] = dict(release_ident=extra["release_ident"])
return row
@@ -38,9 +38,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="arabesque output file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="arabesque output file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -48,5 +48,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 1f4a19f..9607b85 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -13,7 +13,6 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -26,17 +25,16 @@ DOMAIN_BLOCKLIST = [
"://archive.org/",
".archive.org/",
"://127.0.0.1/",
-
# OAI specific additions
"://hdl.handle.net/",
]
RELEASE_STAGE_MAP = {
- 'info:eu-repo/semantics/draftVersion': 'draft',
- 'info:eu-repo/semantics/submittedVersion': 'submitted',
- 'info:eu-repo/semantics/acceptedVersion': 'accepted',
- 'info:eu-repo/semantics/publishedVersion': 'published',
- 'info:eu-repo/semantics/updatedVersion': 'updated',
+ "info:eu-repo/semantics/draftVersion": "draft",
+ "info:eu-repo/semantics/submittedVersion": "submitted",
+ "info:eu-repo/semantics/acceptedVersion": "accepted",
+ "info:eu-repo/semantics/publishedVersion": "published",
+ "info:eu-repo/semantics/updatedVersion": "updated",
}
@@ -52,38 +50,38 @@ def transform(obj):
"""
requests = []
- if not obj.get('oai') or not obj['oai'].startswith('oai:'):
+ if not obj.get("oai") or not obj["oai"].startswith("oai:"):
return []
- if not obj.get('urls'):
+ if not obj.get("urls"):
return []
# look in obj['formats'] for PDF?
- if obj.get('formats'):
+ if obj.get("formats"):
# if there is a list of formats, and it does not contain PDF, then
# skip. Note that we will continue if there is no formats list.
has_pdf = False
- for f in obj['formats']:
- if 'pdf' in f.lower():
+ for f in obj["formats"]:
+ if "pdf" in f.lower():
has_pdf = True
if not has_pdf:
return []
doi = None
- if obj.get('doi'):
- doi = obj['doi'][0].lower().strip()
- if not doi.startswith('10.'):
+ if obj.get("doi"):
+ doi = obj["doi"][0].lower().strip()
+ if not doi.startswith("10."):
doi = None
# infer release stage and/or type from obj['types']
release_stage = None
- for t in obj.get('types', []):
+ for t in obj.get("types", []):
if t in RELEASE_STAGE_MAP:
release_stage = RELEASE_STAGE_MAP[t]
# TODO: infer rel somehow? Eg, repository vs. OJS publisher
rel = None
- for url in obj['urls']:
+ for url in obj["urls"]:
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in url:
@@ -96,18 +94,18 @@ def transform(obj):
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'oai',
- 'link_source_id': obj['oai'].lower(),
- 'ingest_request_source': 'metha-bulk',
- 'release_stage': release_stage,
- 'rel': rel,
- 'ext_ids': {
- 'doi': doi,
- 'oai': obj['oai'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "oai",
+ "link_source_id": obj["oai"].lower(),
+ "ingest_request_source": "metha-bulk",
+ "release_stage": release_stage,
+ "rel": rel,
+ "ext_ids": {
+ "doi": doi,
+ "oai": obj["oai"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
requests.append(request)
@@ -127,9 +125,11 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="OAI-PMH dump file to use (usually stdin)",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file",
+ help="OAI-PMH dump file to use (usually stdin)",
+ type=argparse.FileType("r"),
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -137,5 +137,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index 3f81b3b..8b57c5b 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -22,15 +22,16 @@ def run(inpath, outpath):
renderer = poppler.PageRenderer()
full_page = renderer.render_page(page)
- img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw',
- "BGRA", 0, 1)
+ img = Image.frombuffer(
+ "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1
+ )
img.thumbnail((180, 300), Image.BICUBIC)
- #img.thumbnail((360,600), Image.BICUBIC)
+ # img.thumbnail((360,600), Image.BICUBIC)
img.save(outpath)
- #img.save(outpath, quality=95)
+ # img.save(outpath, quality=95)
-if __name__ == '__main__':
+if __name__ == "__main__":
if len(sys.argv) != 3:
print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr)
sys.exit(-1)
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index b79f316..ad5353b 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -11,7 +11,6 @@ import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
-
# large repos and aggregators (we crawl directly)
"://arxiv.org/",
"://europepmc.org/",
@@ -25,11 +24,11 @@ DOMAIN_BLOCKLIST = [
]
RELEASE_STAGE_MAP = {
- 'draftVersion': 'draft',
- 'submittedVersion': 'submitted',
- 'acceptedVersion': 'accepted',
- 'publishedVersion': 'published',
- 'updatedVersion': 'updated',
+ "draftVersion": "draft",
+ "submittedVersion": "submitted",
+ "acceptedVersion": "accepted",
+ "publishedVersion": "published",
+ "updatedVersion": "updated",
}
@@ -45,44 +44,44 @@ def transform(obj):
"""
requests = []
- if not obj['doi'].startswith('10.'):
+ if not obj["doi"].startswith("10."):
return requests
- if not obj['oa_locations']:
+ if not obj["oa_locations"]:
return requests
- for location in obj['oa_locations']:
- if not location['url_for_pdf']:
+ for location in obj["oa_locations"]:
+ if not location["url_for_pdf"]:
continue
skip = False
for domain in DOMAIN_BLOCKLIST:
- if domain in location['url_for_pdf']:
+ if domain in location["url_for_pdf"]:
skip = True
if skip:
continue
try:
- base_url = canon(location['url_for_pdf'])
+ base_url = canon(location["url_for_pdf"])
except UnicodeEncodeError:
continue
request = {
- 'base_url': base_url,
- 'ingest_type': 'pdf',
- 'link_source': 'unpaywall',
- 'link_source_id': obj['doi'].lower(),
- 'ingest_request_source': 'unpaywall',
- 'release_stage': RELEASE_STAGE_MAP.get(location['version']),
- 'rel': location['host_type'],
- 'ext_ids': {
- 'doi': obj['doi'].lower(),
+ "base_url": base_url,
+ "ingest_type": "pdf",
+ "link_source": "unpaywall",
+ "link_source_id": obj["doi"].lower(),
+ "ingest_request_source": "unpaywall",
+ "release_stage": RELEASE_STAGE_MAP.get(location["version"]),
+ "rel": location["host_type"],
+ "ext_ids": {
+ "doi": obj["doi"].lower(),
},
- 'edit_extra': {},
+ "edit_extra": {},
}
- if obj.get('oa_status'):
- request['edit_extra']['oa_status'] = obj['oa_status']
- if location.get('evidence'):
- request['edit_extra']['evidence'] = location['evidence']
- if location['pmh_id']:
- request['ext_ids']['pmh_id'] = location['pmh_id']
+ if obj.get("oa_status"):
+ request["edit_extra"]["oa_status"] = obj["oa_status"]
+ if location.get("evidence"):
+ request["edit_extra"]["evidence"] = location["evidence"]
+ if location["pmh_id"]:
+ request["ext_ids"]["pmh_id"] = location["pmh_id"]
requests.append(request)
return requests
@@ -101,9 +100,9 @@ def run(args):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('json_file',
- help="unpaywall dump file to use",
- type=argparse.FileType('r'))
+ parser.add_argument(
+ "json_file", help="unpaywall dump file to use", type=argparse.FileType("r")
+ )
subparsers = parser.add_subparsers()
args = parser.parse_args()
@@ -111,5 +110,5 @@ def main():
run(args)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()