57 files changed, 3991 insertions, 3126 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py index 0d47f36..d92b351 100755 --- a/python/grobid2json.py +++ b/python/grobid2json.py @@ -202,7 +202,8 @@ def main() -> None: # pragma no cover json.dumps( teixml2json(content, encumbered=(not args.no_encumbered)), sort_keys=True, - )) + ) + ) if __name__ == "__main__": # pragma no cover diff --git a/python/grobid_tool.py b/python/grobid_tool.py index c36fe0a..f85d243 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -39,7 +39,7 @@ def run_extract_cdx(args): multi_worker, args.cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], batch_size=args.jobs, ) else: @@ -48,7 +48,7 @@ def run_extract_cdx(args): worker, args.cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], ) pusher.run() @@ -75,64 +75,74 @@ def run_transform(args): if args.metadata_only: out = grobid_client.metadata(line) else: - out = teixml2json(line['tei_xml']) + out = teixml2json(line["tei_xml"]) if out: - if 'source' in line: - out['source'] = line['source'] + if "source" in line: + out["source"] = line["source"] print(json.dumps(out)) def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") - parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', - '--jobs', - default=8, - type=int, - help="parallelism for batch CPU jobs") - parser.add_argument('--grobid-host', - default="http://grobid.qa.fatcat.wiki", - help="GROBID API host/port") + parser.add_argument( + "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)" + ) + parser.add_argument( + "--kafka-hosts", + default="localhost:9092", + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) + parser.add_argument( + "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs" + ) + parser.add_argument( + "--grobid-host", default="http://grobid.qa.fatcat.wiki", help="GROBID API host/port" + ) subparsers = parser.add_subparsers() sub_extract_json = subparsers.add_parser( - 'extract-json', - help="for each JSON line with CDX info, fetches PDF and does GROBID extraction") + "extract-json", + help="for each JSON line with CDX info, fetches PDF and does GROBID extraction", + ) sub_extract_json.set_defaults(func=run_extract_json) - sub_extract_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_extract_json.add_argument( + "json_file", + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) sub_extract_cdx = subparsers.add_parser( - 'extract-cdx', help="for each CDX line, fetches PDF and does GROBID extraction") + "extract-cdx", help="for each CDX line, fetches PDF and does GROBID extraction" + ) sub_extract_cdx.set_defaults(func=run_extract_cdx) - sub_extract_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_extract_cdx.add_argument( + "cdx_file", + help="CDX file to import from (or '-' for stdin)", + 
type=argparse.FileType("r"), + ) sub_extract_zipfile = subparsers.add_parser( - 'extract-zipfile', - help="opens zipfile, iterates over PDF files inside and does GROBID extract for each") + "extract-zipfile", + help="opens zipfile, iterates over PDF files inside and does GROBID extract for each", + ) sub_extract_zipfile.set_defaults(func=run_extract_zipfile) - sub_extract_zipfile.add_argument('zip_file', help="zipfile with PDFs to extract", type=str) + sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str) - sub_transform = subparsers.add_parser('transform') + sub_transform = subparsers.add_parser("transform") sub_transform.set_defaults(func=run_transform) - sub_transform.add_argument('--metadata-only', - action='store_true', - help="Only pass through bibliographic metadata, not fulltext") sub_transform.add_argument( - 'json_file', + "--metadata-only", + action="store_true", + help="Only pass through bibliographic metadata, not fulltext", + ) + sub_transform.add_argument( + "json_file", help="convert TEI-XML to JSON. Input is JSON lines with tei_xml field", - type=argparse.FileType('r')) + type=argparse.FileType("r"), + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -148,5 +158,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py index ac17003..8a95d95 100755 --- a/python/ia_pdf_match.py +++ b/python/ia_pdf_match.py @@ -27,69 +27,70 @@ from typing import Any, Dict, Optional def parse(obj: dict) -> Optional[Dict[str, Any]]: - if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'): - print('skip: test item', file=sys.stderr) + if obj["metadata"]["identifier"].endswith("-test") or obj["metadata"].get("test"): + print("skip: test item", file=sys.stderr) return None extid_type = None extid = None - if obj['metadata']['identifier'].startswith('arxiv-'): - extid_type = 'arxiv' - extid = obj['metadata'].get('source') + if obj["metadata"]["identifier"].startswith("arxiv-"): + extid_type = "arxiv" + extid = obj["metadata"].get("source") if not extid: - print('skip: no source', file=sys.stderr) + print("skip: no source", file=sys.stderr) return None - assert extid.startswith('http://arxiv.org/abs/') - extid = extid.replace('http://arxiv.org/abs/', '') - #print(extid) - assert '/' in extid or '.' in extid - if 'v' not in extid or not extid[-1].isdigit(): - print('skip: non-versioned arxiv_id', file=sys.stderr) + assert extid.startswith("http://arxiv.org/abs/") + extid = extid.replace("http://arxiv.org/abs/", "") + # print(extid) + assert "/" in extid or "." 
in extid + if "v" not in extid or not extid[-1].isdigit(): + print("skip: non-versioned arxiv_id", file=sys.stderr) return None - elif obj['metadata']['identifier'].startswith('paper-doi-10_'): - extid_type = 'doi' - extid = obj['metadata']['identifier-doi'] + elif obj["metadata"]["identifier"].startswith("paper-doi-10_"): + extid_type = "doi" + extid = obj["metadata"]["identifier-doi"] assert extid.startswith("10.") - elif obj['metadata']['identifier'].startswith('pubmed-PMC'): - extid_type = 'pmcid' - extid = obj['metadata']['identifier'].replace('pubmed-', '') + elif obj["metadata"]["identifier"].startswith("pubmed-PMC"): + extid_type = "pmcid" + extid = obj["metadata"]["identifier"].replace("pubmed-", "") assert extid.startswith("PMC") int(extid[3:]) - elif obj['metadata']['identifier'].startswith('jstor-'): - extid_type = 'jstor' - extid = obj['metadata']['identifier'].replace('jstor-', '') + elif obj["metadata"]["identifier"].startswith("jstor-"): + extid_type = "jstor" + extid = obj["metadata"]["identifier"].replace("jstor-", "") int(extid) else: raise NotImplementedError() pdf_file = None - for f in obj['files']: - if f['source'] == "original" and "PDF" in f['format']: + for f in obj["files"]: + if f["source"] == "original" and "PDF" in f["format"]: pdf_file = f break if not pdf_file: - print('skip: no PDF found: {}'.format(obj['metadata']['identifier']), file=sys.stderr) - #for f in obj['files']: + print("skip: no PDF found: {}".format(obj["metadata"]["identifier"]), file=sys.stderr) + # for f in obj['files']: # print(f['format'], file=sys.stderr) return None - assert pdf_file['name'].endswith('.pdf') + assert pdf_file["name"].endswith(".pdf") match = { - 'md5': pdf_file['md5'], - 'sha1': pdf_file['sha1'], - 'size': int(pdf_file['size']), - 'mimetype': 'application/pdf', - 'urls': [ - "https://archive.org/download/{}/{}".format(obj['metadata']['identifier'], - pdf_file['name']), + "md5": pdf_file["md5"], + "sha1": pdf_file["sha1"], + "size": int(pdf_file["size"]), + "mimetype": "application/pdf", + "urls": [ + "https://archive.org/download/{}/{}".format( + obj["metadata"]["identifier"], pdf_file["name"] + ), ], - 'cdx': [], - 'dois': [], + "cdx": [], + "dois": [], } - if extid_type == 'doi': - match['dois'] = [ + if extid_type == "doi": + match["dois"] = [ extid, ] else: @@ -108,5 +109,5 @@ def run() -> None: print(json.dumps(match, sort_keys=True)) -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/python/ingest_tool.py b/python/ingest_tool.py index eb1047d..7405d28 100755 --- a/python/ingest_tool.py +++ b/python/ingest_tool.py @@ -17,9 +17,9 @@ def run_single_ingest(args): fatcat=dict(release_ident=args.release_id), ) if args.force_recrawl: - request['force_recrawl'] = True - if request['ingest_type'] in [ - 'dataset', + request["force_recrawl"] = True + if request["ingest_type"] in [ + "dataset", ]: ingester = IngestFilesetWorker( try_spn2=not args.no_spn2, @@ -41,11 +41,13 @@ def run_requests(args): try_spn2=not args.no_spn2, html_quick_mode=args.html_quick_mode, ) - fileset_worker = IngestFilesetWorker(try_spn2=not args.no_spn2, ) + fileset_worker = IngestFilesetWorker( + try_spn2=not args.no_spn2, + ) for line in args.json_file: request = json.loads(line.strip()) - if request['ingest_type'] in [ - 'dataset', + if request["ingest_type"] in [ + "dataset", ]: result = fileset_worker.process(request) else: @@ -56,7 +58,7 @@ def run_requests(args): def run_api(args): port = 8083 print("Listening on localhost:{}".format(port)) - server = HTTPServer(('', 
port), IngestFileRequestHandler) + server = HTTPServer(("", port), IngestFileRequestHandler) server.serve_forever() @@ -64,41 +66,52 @@ def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) subparsers = parser.add_subparsers() - sub_single = subparsers.add_parser('single', help="ingests a single base URL") + sub_single = subparsers.add_parser("single", help="ingests a single base URL") sub_single.set_defaults(func=run_single_ingest) - sub_single.add_argument('ingest_type', - default="pdf", - help="type of ingest (pdf, html, etc)") - sub_single.add_argument('--release-id', - help="(optional) existing release ident to match to") - sub_single.add_argument('--doi', help="(optional) existing release DOI to match to") - sub_single.add_argument('--force-recrawl', - action='store_true', - help="ignore GWB history and use SPNv2 to re-crawl") - sub_single.add_argument('--no-spn2', action='store_true', help="don't use live web (SPNv2)") - sub_single.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") - sub_single.add_argument('url', help="URL of paper to fetch") + sub_single.add_argument( + "ingest_type", default="pdf", help="type of ingest (pdf, html, etc)" + ) + sub_single.add_argument( + "--release-id", help="(optional) existing release ident to match to" + ) + sub_single.add_argument("--doi", help="(optional) existing release DOI to match to") + sub_single.add_argument( + "--force-recrawl", + action="store_true", + help="ignore GWB history and use SPNv2 to re-crawl", + ) + sub_single.add_argument("--no-spn2", action="store_true", help="don't use live web (SPNv2)") + sub_single.add_argument( + "--html-quick-mode", + action="store_true", + help="don't fetch individual sub-resources, just use CDX", + ) + sub_single.add_argument("url", help="URL of paper to fetch") sub_requests = subparsers.add_parser( - 'requests', help="takes a series of ingest requests (JSON, per line) and runs each") - sub_requests.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") - sub_requests.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") + "requests", help="takes a series of ingest requests (JSON, per line) and runs each" + ) + sub_requests.add_argument( + "--no-spn2", action="store_true", help="don't use live web (SPNv2)" + ) + sub_requests.add_argument( + "--html-quick-mode", + action="store_true", + help="don't fetch individual sub-resources, just use CDX", + ) sub_requests.set_defaults(func=run_requests) - sub_requests.add_argument('json_file', - help="JSON file (request per line) to import from (or stdin)", - default=sys.stdin, - type=argparse.FileType('r')) + sub_requests.add_argument( + "json_file", + help="JSON file (request per line) to import from (or stdin)", + default=sys.stdin, + type=argparse.FileType("r"), + ) sub_api = subparsers.add_parser( - 'api', help="starts a simple HTTP server that processes ingest requests") + "api", help="starts a simple HTTP server that processes ingest requests" + ) sub_api.set_defaults(func=run_api) - sub_api.add_argument('--port', help="HTTP port to listen on", default=8033, type=int) + sub_api.add_argument("--port", help="HTTP port to listen on", default=8033, type=int) args = parser.parse_args() if not args.__dict__.get("func"): @@ -108,5 +121,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git 
a/python/pdfextract_tool.py b/python/pdfextract_tool.py index f3df6e3..28d6397 100755 --- a/python/pdfextract_tool.py +++ b/python/pdfextract_tool.py @@ -16,9 +16,9 @@ def run_extract_json(args): multi_worker = MultiprocessWrapper(worker, args.sink) pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs) else: - worker = PdfExtractWorker(wayback_client, - sink=args.sink, - thumbnail_sink=args.thumbnail_sink) + worker = PdfExtractWorker( + wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink + ) pusher = JsonLinePusher(worker, args.json_file) pusher.run() @@ -32,18 +32,18 @@ def run_extract_cdx(args): multi_worker, args.cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], batch_size=args.jobs, ) else: - worker = PdfExtractWorker(wayback_client, - sink=args.sink, - thumbnail_sink=args.thumbnail_sink) + worker = PdfExtractWorker( + wayback_client, sink=args.sink, thumbnail_sink=args.thumbnail_sink + ) pusher = CdxLinePusher( worker, args.cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], ) pusher.run() @@ -62,7 +62,7 @@ def run_extract_zipfile(args): def run_single(args): worker = PdfExtractBlobWorker(sink=args.sink, thumbnail_sink=args.thumbnail_sink) - with open(args.pdf_file, 'rb') as pdf_file: + with open(args.pdf_file, "rb") as pdf_file: pdf_bytes = pdf_file.read() worker.push_record(pdf_bytes) worker.finish() @@ -72,46 +72,53 @@ def run_single(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") - parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', - '--jobs', - default=8, - type=int, - help="parallelism for batch CPU jobs") + parser.add_argument( + "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)" + ) + parser.add_argument( + "--kafka-hosts", + default="localhost:9092", + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) + parser.add_argument( + "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs" + ) subparsers = parser.add_subparsers() sub_extract_json = subparsers.add_parser( - 'extract-json', - help="for each JSON line with CDX info, fetches PDF and does PDF extraction") + "extract-json", + help="for each JSON line with CDX info, fetches PDF and does PDF extraction", + ) sub_extract_json.set_defaults(func=run_extract_json) - sub_extract_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_extract_json.add_argument( + "json_file", + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) sub_extract_cdx = subparsers.add_parser( - 'extract-cdx', help="for each CDX line, fetches PDF and does PDF extraction") + "extract-cdx", help="for each CDX line, fetches PDF and does PDF extraction" + ) sub_extract_cdx.set_defaults(func=run_extract_cdx) - sub_extract_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + 
sub_extract_cdx.add_argument( + "cdx_file", + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) sub_extract_zipfile = subparsers.add_parser( - 'extract-zipfile', - help="opens zipfile, iterates over PDF files inside and does PDF extract for each") + "extract-zipfile", + help="opens zipfile, iterates over PDF files inside and does PDF extract for each", + ) sub_extract_zipfile.set_defaults(func=run_extract_zipfile) - sub_extract_zipfile.add_argument('zip_file', help="zipfile with PDFs to extract", type=str) + sub_extract_zipfile.add_argument("zip_file", help="zipfile with PDFs to extract", type=str) - sub_single = subparsers.add_parser('single', help="opens single PDF and extracts it") + sub_single = subparsers.add_parser("single", help="opens single PDF and extracts it") sub_single.set_defaults(func=run_single) - sub_single.add_argument('pdf_file', help="single PDF to extract", type=str) + sub_single.add_argument("pdf_file", help="single PDF to extract", type=str) args = parser.parse_args() if not args.__dict__.get("func"): @@ -124,11 +131,15 @@ def main(): text_topic = "sandcrawler-{}.pdf-text".format(args.kafka_env) thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.kafka_env) args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=text_topic) - args.thumbnail_sink = KafkaSink(kafka_hosts=args.kafka_hosts, - produce_topic=thumbnail_topic) - print("Running in kafka output mode, publishing to {} and {}\n".format( - text_topic, thumbnail_topic), - file=sys.stderr) + args.thumbnail_sink = KafkaSink( + kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic + ) + print( + "Running in kafka output mode, publishing to {} and {}\n".format( + text_topic, thumbnail_topic + ), + file=sys.stderr, + ) else: args.sink = None args.thumbnail_sink = None @@ -136,5 +147,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py index dbe5b10..9d3010e 100755 --- a/python/pdftrio_tool.py +++ b/python/pdftrio_tool.py @@ -18,17 +18,15 @@ def run_classify_pdf_json(args): pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host) wayback_client = WaybackClient() if args.jobs > 1: - worker = PdfTrioWorker(pdftrio_client, - wayback_client, - sink=None, - mode=args.pdftrio_mode) + worker = PdfTrioWorker( + pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode + ) multi_worker = MultiprocessWrapper(worker, args.sink) pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs) else: - worker = PdfTrioWorker(pdftrio_client, - wayback_client, - sink=args.sink, - mode=args.pdftrio_mode) + worker = PdfTrioWorker( + pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode + ) pusher = JsonLinePusher(worker, args.json_file) pusher.run() @@ -37,28 +35,26 @@ def run_classify_pdf_cdx(args): pdftrio_client = PdfTrioClient(host_url=args.pdftrio_host) wayback_client = WaybackClient() if args.jobs > 1: - worker = PdfTrioWorker(pdftrio_client, - wayback_client, - sink=None, - mode=args.pdftrio_mode) + worker = PdfTrioWorker( + pdftrio_client, wayback_client, sink=None, mode=args.pdftrio_mode + ) multi_worker = MultiprocessWrapper(worker, args.sink) pusher = CdxLinePusher( multi_worker, args.cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], batch_size=args.jobs, ) else: - worker = PdfTrioWorker(pdftrio_client, - wayback_client, - sink=args.sink, 
- mode=args.pdftrio_mode) + worker = PdfTrioWorker( + pdftrio_client, wayback_client, sink=args.sink, mode=args.pdftrio_mode + ) pusher = CdxLinePusher( worker, args.cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], ) pusher.run() @@ -72,52 +68,58 @@ def run_classify_pdf_zipfile(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--kafka-mode', - action='store_true', - help="send output to Kafka (not stdout)") - parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--kafka-env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('-j', - '--jobs', - default=8, - type=int, - help="parallelism for batch CPU jobs") - parser.add_argument('--pdftrio-host', - default="http://pdftrio.qa.fatcat.wiki", - help="pdftrio API host/port") - parser.add_argument('--pdftrio-mode', - default="auto", - help="which classification mode to use") + parser.add_argument( + "--kafka-mode", action="store_true", help="send output to Kafka (not stdout)" + ) + parser.add_argument( + "--kafka-hosts", + default="localhost:9092", + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) + parser.add_argument( + "-j", "--jobs", default=8, type=int, help="parallelism for batch CPU jobs" + ) + parser.add_argument( + "--pdftrio-host", default="http://pdftrio.qa.fatcat.wiki", help="pdftrio API host/port" + ) + parser.add_argument( + "--pdftrio-mode", default="auto", help="which classification mode to use" + ) subparsers = parser.add_subparsers() sub_classify_pdf_json = subparsers.add_parser( - 'classify-pdf-json', - help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion") + "classify-pdf-json", + help="for each JSON line with CDX info, fetches PDF and does pdftrio classify_pdfion", + ) sub_classify_pdf_json.set_defaults(func=run_classify_pdf_json) - sub_classify_pdf_json.add_argument('json_file', - help="JSON file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_classify_pdf_json.add_argument( + "json_file", + help="JSON file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) sub_classify_pdf_cdx = subparsers.add_parser( - 'classify-pdf-cdx', - help="for each CDX line, fetches PDF and does pdftrio classify_pdfion") + "classify-pdf-cdx", + help="for each CDX line, fetches PDF and does pdftrio classify_pdfion", + ) sub_classify_pdf_cdx.set_defaults(func=run_classify_pdf_cdx) - sub_classify_pdf_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_classify_pdf_cdx.add_argument( + "cdx_file", + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) sub_classify_pdf_zipfile = subparsers.add_parser( - 'classify-pdf-zipfile', - help= - "opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each") + "classify-pdf-zipfile", + help="opens zipfile, iterates over PDF files inside and does pdftrio classify_pdf for each", + ) sub_classify_pdf_zipfile.set_defaults(func=run_classify_pdf_zipfile) - sub_classify_pdf_zipfile.add_argument('zip_file', - help="zipfile with PDFs to classify", - type=str) + sub_classify_pdf_zipfile.add_argument( + "zip_file", 
help="zipfile with PDFs to classify", type=str + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -133,5 +135,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/persist_tool.py b/python/persist_tool.py index 9160db6..b124ddc 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -15,8 +15,10 @@ from sandcrawler.persist import * def run_cdx(args): - worker = PersistCdxWorker(db_url=args.db_url, ) - filter_mimetypes = ['application/pdf'] + worker = PersistCdxWorker( + db_url=args.db_url, + ) + filter_mimetypes = ["application/pdf"] if args.no_mimetype_filter: filter_mimetypes = None pusher = CdxLinePusher( @@ -24,7 +26,7 @@ def run_cdx(args): args.cdx_file, filter_http_statuses=[200, 226], filter_mimetypes=filter_mimetypes, - #allow_octet_stream + # allow_octet_stream batch_size=200, ) pusher.run() @@ -53,7 +55,9 @@ def run_grobid_disk(args): Writes XML to individual files on disk, and also prints non-XML metadata to stdout as JSON, which can be redirected to a separate file. """ - worker = PersistGrobidDiskWorker(output_dir=args.output_dir, ) + worker = PersistGrobidDiskWorker( + output_dir=args.output_dir, + ) pusher = JsonLinePusher( worker, args.json_file, @@ -62,7 +66,9 @@ def run_grobid_disk(args): def run_pdftrio(args): - worker = PersistPdfTrioWorker(db_url=args.db_url, ) + worker = PersistPdfTrioWorker( + db_url=args.db_url, + ) pusher = JsonLinePusher( worker, args.json_file, @@ -90,7 +96,9 @@ def run_pdftext(args): def run_ingest_file_result(args): - worker = PersistIngestFileResultWorker(db_url=args.db_url, ) + worker = PersistIngestFileResultWorker( + db_url=args.db_url, + ) pusher = JsonLinePusher( worker, args.json_file, @@ -100,7 +108,9 @@ def run_ingest_file_result(args): def run_ingest_request(args): - worker = PersistIngestRequestWorker(db_url=args.db_url, ) + worker = PersistIngestRequestWorker( + db_url=args.db_url, + ) pusher = JsonLinePusher( worker, args.json_file, @@ -111,92 +121,122 @@ def run_ingest_request(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--db-url', - help="postgresql database connection string", - default="postgres:///sandcrawler") - parser.add_argument('--s3-url', help="S3 (seaweedfs) backend URL", default="localhost:9000") - parser.add_argument('--s3-access-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') - or os.environ.get('MINIO_ACCESS_KEY')) - parser.add_argument('--s3-secret-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') - or os.environ.get('MINIO_SECRET_KEY')) - parser.add_argument('--s3-bucket', - help="S3 (seaweedfs) bucket to persist into", - default="sandcrawler-dev") + parser.add_argument( + "--db-url", + help="postgresql database connection string", + default="postgres:///sandcrawler", + ) + parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000") + parser.add_argument( + "--s3-access-key", + help="S3 (seaweedfs) credential", + default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY") + or os.environ.get("MINIO_ACCESS_KEY"), + ) + parser.add_argument( + "--s3-secret-key", + help="S3 (seaweedfs) credential", + default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY") + or os.environ.get("MINIO_SECRET_KEY"), + ) + parser.add_argument( + "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev" + ) subparsers 
= parser.add_subparsers() - sub_cdx = subparsers.add_parser('cdx', help="backfill a CDX file into postgresql cdx table") + sub_cdx = subparsers.add_parser("cdx", help="backfill a CDX file into postgresql cdx table") sub_cdx.set_defaults(func=run_cdx) - sub_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) sub_cdx.add_argument( - '--no-mimetype-filter', - action='store_true', - help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)") + "cdx_file", + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) + sub_cdx.add_argument( + "--no-mimetype-filter", + action="store_true", + help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)", + ) sub_grobid = subparsers.add_parser( - 'grobid', help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)") + "grobid", help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)" + ) sub_grobid.set_defaults(func=run_grobid) - sub_grobid.add_argument('json_file', - help="grobid file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_grobid.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") sub_grobid.add_argument( - '--db-only', - action='store_true', - help="only write status to sandcrawler-db (don't save TEI-XML to S3)") + "json_file", + help="grobid file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) + sub_grobid.add_argument( + "--s3-only", + action="store_true", + help="only upload TEI-XML to S3 (don't write to database)", + ) + sub_grobid.add_argument( + "--db-only", + action="store_true", + help="only write status to sandcrawler-db (don't save TEI-XML to S3)", + ) sub_pdftext = subparsers.add_parser( - 'pdftext', - help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)") + "pdftext", help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)" + ) sub_pdftext.set_defaults(func=run_pdftext) - sub_pdftext.add_argument('json_file', - help="pdftext file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_pdftext.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") sub_pdftext.add_argument( - '--db-only', - action='store_true', - help="only write status to sandcrawler-db (don't save TEI-XML to S3)") + "json_file", + help="pdftext file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) + sub_pdftext.add_argument( + "--s3-only", + action="store_true", + help="only upload TEI-XML to S3 (don't write to database)", + ) + sub_pdftext.add_argument( + "--db-only", + action="store_true", + help="only write status to sandcrawler-db (don't save TEI-XML to S3)", + ) - sub_grobid_disk = subparsers.add_parser('grobid-disk', - help="dump GRBOID output to (local) files on disk") + sub_grobid_disk = subparsers.add_parser( + "grobid-disk", help="dump GRBOID output to (local) files on disk" + ) sub_grobid_disk.set_defaults(func=run_grobid_disk) - sub_grobid_disk.add_argument('json_file', - help="grobid file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_grobid_disk.add_argument('output_dir', help="base directory to output into", type=str) + sub_grobid_disk.add_argument( + "json_file", + help="grobid file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) + sub_grobid_disk.add_argument("output_dir", 
help="base directory to output into", type=str) sub_pdftrio = subparsers.add_parser( - 'pdftrio', - help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)") + "pdftrio", help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)" + ) sub_pdftrio.set_defaults(func=run_pdftrio) - sub_pdftrio.add_argument('json_file', - help="pdftrio file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_pdftrio.add_argument( + "json_file", + help="pdftrio file to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) sub_ingest_file_result = subparsers.add_parser( - 'ingest-file-result', help="backfill a ingest_file_result JSON dump into postgresql") + "ingest-file-result", help="backfill a ingest_file_result JSON dump into postgresql" + ) sub_ingest_file_result.set_defaults(func=run_ingest_file_result) sub_ingest_file_result.add_argument( - 'json_file', + "json_file", help="ingest_file_result file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + type=argparse.FileType("r"), + ) sub_ingest_request = subparsers.add_parser( - 'ingest-request', help="backfill a ingest_request JSON dump into postgresql") + "ingest-request", help="backfill a ingest_request JSON dump into postgresql" + ) sub_ingest_request.set_defaults(func=run_ingest_request) - sub_ingest_request.add_argument('json_file', - help="ingest_request to import from (or '-' for stdin)", - type=argparse.FileType('r')) + sub_ingest_request.add_argument( + "json_file", + help="ingest_request to import from (or '-' for stdin)", + type=argparse.FileType("r"), + ) args = parser.parse_args() if not args.__dict__.get("func"): @@ -206,5 +246,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 46735eb..6718c57 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -1,16 +1,48 @@ from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker -from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, - SavePageNowClient, SavePageNowError, WarcResource, WaybackClient, - WaybackContentError, WaybackError) +from .ia import ( + CdxApiClient, + CdxApiError, + CdxPartial, + CdxRow, + PetaboxError, + ResourceResult, + SavePageNowClient, + SavePageNowError, + WarcResource, + WaybackClient, + WaybackContentError, + WaybackError, +) from .ingest_file import IngestFileWorker from .ingest_fileset import IngestFilesetWorker -from .misc import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, - parse_cdx_datetime, parse_cdx_line) +from .misc import ( + b32_hex, + clean_url, + gen_file_metadata, + gen_file_metadata_path, + parse_cdx_datetime, + parse_cdx_line, +) from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker -from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, - PersistIngestFileResultWorker, PersistIngestRequestWorker, - PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker) -from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, - KafkaJsonPusher, KafkaSink, MultiprocessWrapper, ZipfilePusher) +from .persist import ( + PersistCdxWorker, + PersistGrobidDiskWorker, + PersistGrobidWorker, + PersistIngestFileResultWorker, + PersistIngestRequestWorker, + 
PersistPdfTextWorker, + PersistPdfTrioWorker, + PersistThumbnailWorker, +) +from .workers import ( + BlackholeSink, + CdxLinePusher, + JsonLinePusher, + KafkaCompressSink, + KafkaJsonPusher, + KafkaSink, + MultiprocessWrapper, + ZipfilePusher, +) diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index ee4d3bf..69d2116 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -12,12 +12,12 @@ class SandcrawlerPostgrestClient: self.api_url = api_url def get_cdx(self, url: str) -> Optional[dict]: - resp = requests.get(self.api_url + "/cdx", params=dict(url='eq.' + url)) + resp = requests.get(self.api_url + "/cdx", params=dict(url="eq." + url)) resp.raise_for_status() return resp.json() or None def get_grobid(self, sha1: str) -> Optional[dict]: - resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex='eq.' + sha1)) + resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -26,7 +26,7 @@ class SandcrawlerPostgrestClient: return None def get_pdftrio(self, sha1: str) -> Optional[dict]: - resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex='eq.' + sha1)) + resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1)) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -35,7 +35,7 @@ class SandcrawlerPostgrestClient: return None def get_pdf_meta(self, sha1: str) -> Optional[dict]: - resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex='eq.' + sha1)) + resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -56,7 +56,7 @@ class SandcrawlerPostgrestClient: return None def get_file_meta(self, sha1: str) -> Optional[dict]: - resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex='eq.' + sha1)) + resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1)) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -89,7 +89,7 @@ class SandcrawlerPostgrestClient: return None def get_crossref(self, doi: str) -> Optional[dict]: - resp = requests.get(self.api_url + "/crossref", params=dict(doi='eq.' + doi)) + resp = requests.get(self.api_url + "/crossref", params=dict(doi="eq." 
+ doi)) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -117,10 +117,12 @@ class SandcrawlerPostgresClient: updates = 0 return (inserts, updates) - def insert_cdx(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_cdx( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: sql = """ INSERT INTO cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset) @@ -133,11 +135,21 @@ class SandcrawlerPostgresClient: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - batch = [d for d in batch if d.get('warc_path')] + batch = [d for d in batch if d.get("warc_path")] if not batch: return (0, 0) - rows = [(d['url'], d['datetime'], d['sha1hex'], d['mimetype'], d['warc_path'], - int(d['warc_csize']), int(d['warc_offset'])) for d in batch] + rows = [ + ( + d["url"], + d["datetime"], + d["sha1hex"], + d["mimetype"], + d["warc_path"], + int(d["warc_csize"]), + int(d["warc_offset"]), + ) + for d in batch + ] # filter out duplicate rows by key (url, datetime) row_dict = dict() for b in rows: @@ -146,10 +158,12 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_file_meta(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_file_meta( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: sql = """ INSERT INTO file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype) @@ -168,8 +182,10 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - rows = [(d['sha1hex'], d['sha256hex'], d['md5hex'], int(d['size_bytes']), d['mimetype']) - for d in batch] + rows = [ + (d["sha1hex"], d["sha256hex"], d["md5hex"], int(d["size_bytes"]), d["mimetype"]) + for d in batch + ] # filter out duplicate rows by key (sha1hex) row_dict = dict() for b in rows: @@ -178,10 +194,12 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_grobid(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_grobid( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: sql = """ INSERT INTO grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata) @@ -203,24 +221,27 @@ class SandcrawlerPostgresClient: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" for r in batch: - if r.get('metadata'): + if r.get("metadata"): # sometimes these are only in metadata; shouldn't pass through # though (to save database space) - dupe_fields = ('fatcat_release', 'grobid_version') + dupe_fields = ("fatcat_release", "grobid_version") for k in dupe_fields: if k not in r: - r[k] = r['metadata'].get(k) - r['metadata'].pop(k, None) - r['metadata'] = json.dumps(r['metadata'], sort_keys=True) - rows = [( - d['key'], - d.get('grobid_version') or None, - d['status_code'], - d['status'], - d.get('fatcat_release') or None, - 
d.get('updated') or datetime.datetime.now(), - d.get('metadata') or None, - ) for d in batch] + r[k] = r["metadata"].get(k) + r["metadata"].pop(k, None) + r["metadata"] = json.dumps(r["metadata"], sort_keys=True) + rows = [ + ( + d["key"], + d.get("grobid_version") or None, + d["status_code"], + d["status"], + d.get("fatcat_release") or None, + d.get("updated") or datetime.datetime.now(), + d.get("metadata") or None, + ) + for d in batch + ] # filter out duplicate rows by key (sha1hex) row_dict = dict() for b in rows: @@ -229,10 +250,9 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_pdf_meta(self, - cur: psycopg2.extensions.cursor, - rows: List[Tuple], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_pdf_meta( + self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing" + ) -> Tuple[int, int]: """ batch elements are expected to have .to_sql_tuple() method """ @@ -269,10 +289,9 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_html_meta(self, - cur: psycopg2.extensions.cursor, - rows: List[Tuple], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_html_meta( + self, cur: psycopg2.extensions.cursor, rows: List[Tuple], on_conflict: str = "nothing" + ) -> Tuple[int, int]: """ batch elements are expected to have .to_sql_tuple() method """ @@ -306,10 +325,12 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_pdftrio(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_pdftrio( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: sql = """ INSERT INTO pdftrio (sha1hex, updated, status_code, status, pdftrio_version, @@ -335,18 +356,21 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - rows = [( - d['key'], - d.get('updated') or datetime.datetime.now(), - d['status_code'], - d['status'], - d.get('versions', {}).get('pdftrio_version') or None, - d.get('versions', {}).get('models_date') or None, - d.get('ensemble_score'), - d.get('bert_score'), - d.get('linear_score'), - d.get('image_score'), - ) for d in batch] + rows = [ + ( + d["key"], + d.get("updated") or datetime.datetime.now(), + d["status_code"], + d["status"], + d.get("versions", {}).get("pdftrio_version") or None, + d.get("versions", {}).get("models_date") or None, + d.get("ensemble_score"), + d.get("bert_score"), + d.get("linear_score"), + d.get("image_score"), + ) + for d in batch + ] # filter out duplicate rows by key (sha1hex) row_dict = dict() for b in rows: @@ -355,10 +379,12 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_ingest_request(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_ingest_request( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: 
sql = """ INSERT INTO ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request) @@ -372,21 +398,24 @@ class SandcrawlerPostgresClient: sql += " RETURNING xmax;" for r in batch: # in case these fields were already packed into 'request' - extra = r.get('request', {}) - for k in ('ext_ids', 'fatcat_release', 'edit_extra', 'rel'): + extra = r.get("request", {}) + for k in ("ext_ids", "fatcat_release", "edit_extra", "rel"): if r.get(k): extra[k] = r[k] if extra: - r['extra'] = json.dumps(extra, sort_keys=True) - rows = [( - d['link_source'], - d['link_source_id'], - d['ingest_type'], - d['base_url'], - d.get('ingest_request_source'), - d.get('release_stage') or None, - d.get('extra') or None, - ) for d in batch] + r["extra"] = json.dumps(extra, sort_keys=True) + rows = [ + ( + d["link_source"], + d["link_source_id"], + d["ingest_type"], + d["base_url"], + d.get("ingest_request_source"), + d.get("release_stage") or None, + d.get("extra") or None, + ) + for d in batch + ] # filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url) row_dict = dict() for b in rows: @@ -395,10 +424,12 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_ingest_file_result(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_ingest_file_result( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: sql = """ INSERT INTO ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex) @@ -420,16 +451,19 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - rows = [( - d['ingest_type'], - d['base_url'], - bool(d['hit']), - d['status'], - d.get('terminal_url'), - d.get('terminal_dt'), - d.get('terminal_status_code'), - d.get('terminal_sha1hex'), - ) for d in batch] + rows = [ + ( + d["ingest_type"], + d["base_url"], + bool(d["hit"]), + d["status"], + d.get("terminal_url"), + d.get("terminal_dt"), + d.get("terminal_status_code"), + d.get("terminal_sha1hex"), + ) + for d in batch + ] # filter out duplicate rows by key (ingest_type, base_url) row_dict = dict() for b in rows: @@ -438,10 +472,12 @@ class SandcrawlerPostgresClient: resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) - def insert_ingest_fileset_platform(self, - cur: psycopg2.extensions.cursor, - batch: List[Dict[str, Any]], - on_conflict: str = "nothing") -> Tuple[int, int]: + def insert_ingest_fileset_platform( + self, + cur: psycopg2.extensions.cursor, + batch: List[Dict[str, Any]], + on_conflict: str = "nothing", + ) -> Tuple[int, int]: sql = """ INSERT INTO ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest) @@ -470,23 +506,26 @@ class SandcrawlerPostgresClient: else: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - rows = [( - d['ingest_type'], - d['base_url'], - bool(d['hit']), - d['status'], - d.get('platform_name'), - d.get('platform_domain'), 
- d.get('platform_id'), - d.get('ingest_strategy'), - d.get('total_size'), - d.get('file_count'), - d.get('archiveorg_item_name'), - d.get('archiveorg_item_bundle_path'), - d.get('web_bundle_url'), - d.get('web_bundle_dt'), - d.get('manifest'), - ) for d in batch] + rows = [ + ( + d["ingest_type"], + d["base_url"], + bool(d["hit"]), + d["status"], + d.get("platform_name"), + d.get("platform_domain"), + d.get("platform_id"), + d.get("ingest_strategy"), + d.get("total_size"), + d.get("file_count"), + d.get("archiveorg_item_name"), + d.get("archiveorg_item_bundle_path"), + d.get("web_bundle_url"), + d.get("web_bundle_dt"), + d.get("manifest"), + ) + for d in batch + ] # filter out duplicate rows by key (ingest_type, base_url) row_dict = dict() for b in rows: diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py index 86e3ff2..fbe8066 100644 --- a/python/sandcrawler/fileset_platforms.py +++ b/python/sandcrawler/fileset_platforms.py @@ -4,25 +4,38 @@ from typing import Optional, Tuple import internetarchive import requests -from sandcrawler.fileset_types import (FilesetManifestFile, FilesetPlatformItem, IngestStrategy, - PlatformRestrictedError, PlatformScopeError) +from sandcrawler.fileset_types import ( + FilesetManifestFile, + FilesetPlatformItem, + IngestStrategy, + PlatformRestrictedError, + PlatformScopeError, +) from sandcrawler.html_metadata import BiblioMetadata from sandcrawler.ia import ResourceResult -class FilesetPlatformHelper(): +class FilesetPlatformHelper: def __init__(self): - self.platform_name = 'unknown' - - def match_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> bool: + self.platform_name = "unknown" + + def match_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> bool: """ Does this request look like it matches this platform? 
""" raise NotImplementedError() - def process_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ @@ -47,7 +60,7 @@ class FilesetPlatformHelper(): class DataverseHelper(FilesetPlatformHelper): def __init__(self): super().__init__() - self.platform_name = 'dataverse' + self.platform_name = "dataverse" self.session = requests.Session() @staticmethod @@ -69,16 +82,16 @@ class DataverseHelper(FilesetPlatformHelper): If there is an error parsing, raises a ValueError """ id_type = None - if pid.startswith('doi:10.'): - id_type = 'doi' + if pid.startswith("doi:10."): + id_type = "doi" pid = pid[4:] - elif pid.startswith('hdl:'): - id_type = 'hdl' + elif pid.startswith("hdl:"): + id_type = "hdl" pid = pid[4:] else: raise ValueError(f"unknown dataverse persistentId format: {pid}") - comp = pid.split('/') + comp = pid.split("/") if len(comp) < 2: raise ValueError(f"unknown dataverse persistentId format: {pid}") @@ -114,19 +127,23 @@ class DataverseHelper(FilesetPlatformHelper): "file_id": file_id, } - def match_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> bool: if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] # TODO: could also do HTML platform detection or something? components = urllib.parse.urlparse(url) # platform_domain = components.netloc.split(':')[0].lower() params = urllib.parse.parse_qs(components.query) - id_param = params.get('persistentId') + id_param = params.get("persistentId") if not id_param: return False platform_id = id_param[0] @@ -138,8 +155,12 @@ class DataverseHelper(FilesetPlatformHelper): return True - def process_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) @@ -150,17 +171,17 @@ class DataverseHelper(FilesetPlatformHelper): if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] # 1. extract domain, PID, and version from URL components = urllib.parse.urlparse(url) - platform_domain = components.netloc.split(':')[0].lower() + platform_domain = components.netloc.split(":")[0].lower() params = urllib.parse.parse_qs(components.query) - id_param = params.get('persistentId') + id_param = params.get("persistentId") if not (id_param and id_param[0]): raise PlatformScopeError("Expected a Dataverse persistentId in URL") platform_id = id_param[0] - version_param = params.get('version') + version_param = params.get("version") dataset_version = None if version_param: dataset_version = version_param[0] @@ -170,10 +191,11 @@ class DataverseHelper(FilesetPlatformHelper): except ValueError: raise PlatformScopeError("not actually in scope") - if parsed_id['file_id']: + if parsed_id["file_id"]: # TODO: maybe we could support this? 
raise PlatformScopeError( - "only entire dataverse datasets can be archived with this tool") + "only entire dataverse datasets can be archived with this tool" + ) # 1b. if we didn't get a version number from URL, fetch it from API if not dataset_version: @@ -182,8 +204,10 @@ class DataverseHelper(FilesetPlatformHelper): ) resp.raise_for_status() obj = resp.json() - obj_latest = obj['data']['latestVersion'] - dataset_version = f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}" + obj_latest = obj["data"]["latestVersion"] + dataset_version = ( + f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}" + ) # 2. API fetch resp = self.session.get( @@ -192,69 +216,72 @@ class DataverseHelper(FilesetPlatformHelper): resp.raise_for_status() obj = resp.json() - obj_latest = obj['data']['latestVersion'] - assert dataset_version == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}" - assert platform_id == obj_latest['datasetPersistentId'] + obj_latest = obj["data"]["latestVersion"] + assert ( + dataset_version + == f"{obj_latest['versionNumber']}.{obj_latest['versionMinorNumber']}" + ) + assert platform_id == obj_latest["datasetPersistentId"] manifest = [] - for row in obj_latest['files']: - df = row['dataFile'] - df_persistent_id = df['persistentId'] + for row in obj_latest["files"]: + df = row["dataFile"] + df_persistent_id = df["persistentId"] platform_url = f"https://{platform_domain}/api/access/datafile/:persistentId/?persistentId={df_persistent_id}" - if df.get('originalFileName'): - platform_url += '&format=original' + if df.get("originalFileName"): + platform_url += "&format=original" extra = dict() # TODO: always save the version field? - if row.get('version') != 1: - extra['version'] = row['version'] - if 'description' in df: - extra['description'] = df['description'] + if row.get("version") != 1: + extra["version"] = row["version"] + if "description" in df: + extra["description"] = df["description"] manifest.append( FilesetManifestFile( - path=df.get('originalFileName') or df['filename'], - size=df.get('originalFileSize') or df['filesize'], - md5=df['md5'], + path=df.get("originalFileName") or df["filename"], + size=df.get("originalFileSize") or df["filesize"], + md5=df["md5"], # NOTE: don't get: sha1, sha256 - mimetype=df['contentType'], + mimetype=df["contentType"], platform_url=platform_url, extra=extra or None, - )) + ) + ) - platform_sub_id = platform_id.split('/')[-1] + platform_sub_id = platform_id.split("/")[-1] archiveorg_item_name = f"{platform_domain}-{platform_sub_id}-v{dataset_version}" archiveorg_item_meta = dict( # TODO: collection=platform_domain, collection="datasets", - date=obj_latest['releaseTime'].split('T')[0], - source= - f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}", + date=obj_latest["releaseTime"].split("T")[0], + source=f"https://{platform_domain}/dataset.xhtml?persistentId={platform_id}&version={dataset_version}", ) - if platform_id.startswith('doi:10.'): - archiveorg_item_meta['doi'] = platform_id.replace('doi:', '') - for block in obj_latest['metadataBlocks']['citation']['fields']: - if block['typeName'] == 'title': - archiveorg_item_meta['title'] = block['value'] - elif block['typeName'] == 'depositor': - archiveorg_item_meta['creator'] = block['value'] - elif block['typeName'] == 'dsDescription': - archiveorg_item_meta['description'] = block['value'][0]['dsDescriptionValue'][ - 'value'] - - archiveorg_item_meta['description'] = 
archiveorg_item_meta.get('description', '') - if obj_latest.get('termsOfUse'): - archiveorg_item_meta['description'] += '\n<br>\n' + obj_latest['termsOfUse'] + if platform_id.startswith("doi:10."): + archiveorg_item_meta["doi"] = platform_id.replace("doi:", "") + for block in obj_latest["metadataBlocks"]["citation"]["fields"]: + if block["typeName"] == "title": + archiveorg_item_meta["title"] = block["value"] + elif block["typeName"] == "depositor": + archiveorg_item_meta["creator"] = block["value"] + elif block["typeName"] == "dsDescription": + archiveorg_item_meta["description"] = block["value"][0]["dsDescriptionValue"][ + "value" + ] + + archiveorg_item_meta["description"] = archiveorg_item_meta.get("description", "") + if obj_latest.get("termsOfUse"): + archiveorg_item_meta["description"] += "\n<br>\n" + obj_latest["termsOfUse"] return FilesetPlatformItem( platform_name=self.platform_name, - platform_status='success', + platform_status="success", manifest=manifest, platform_domain=platform_domain, platform_id=platform_id, archiveorg_item_name=archiveorg_item_name, archiveorg_item_meta=archiveorg_item_meta, - web_bundle_url= - f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original", + web_bundle_url=f"https://{platform_domain}/api/access/dataset/:persistentId/?persistentId={platform_id}&format=original", # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files) extra=dict(version=dataset_version), ) @@ -301,7 +328,7 @@ def test_parse_dataverse_persistentid() -> None: } invalid = [ - #"doi:10.5072/FK2/J8SJZB/LL6WXZ", + # "doi:10.5072/FK2/J8SJZB/LL6WXZ", "doi:10.25625/abcd", "other:10.25625/LL6WXZ", "10.25625/LL6WXZ", @@ -322,7 +349,7 @@ def test_parse_dataverse_persistentid() -> None: class FigshareHelper(FilesetPlatformHelper): def __init__(self): super().__init__() - self.platform_name = 'figshare' + self.platform_name = "figshare" self.session = requests.Session() @staticmethod @@ -337,13 +364,13 @@ class FigshareHelper(FilesetPlatformHelper): # eg: /articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1 # /articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4 - comp = path.split('/') - if len(comp) < 4 or comp[1] != 'articles': + comp = path.split("/") + if len(comp) < 4 or comp[1] != "articles": raise ValueError(f"not a figshare URL: {path}") comp = comp[2:] if comp[0] in [ - 'dataset', + "dataset", ]: comp = comp[1:] @@ -354,19 +381,23 @@ class FigshareHelper(FilesetPlatformHelper): else: raise ValueError(f"couldn't find figshare identiier: {path}") - def match_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> bool: if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] components = urllib.parse.urlparse(url) - platform_domain = components.netloc.split(':')[0].lower() + platform_domain = components.netloc.split(":")[0].lower() # only work with full, versioned figshare.com URLs - if 'figshare.com' not in platform_domain: + if "figshare.com" not in platform_domain: return False try: @@ -380,8 +411,12 @@ class FigshareHelper(FilesetPlatformHelper): return False - def process_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: 
Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ @@ -389,15 +424,16 @@ class FigshareHelper(FilesetPlatformHelper): if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] # 1. extract domain, PID, and version from URL components = urllib.parse.urlparse(url) - platform_domain = components.netloc.split(':')[0].lower() + platform_domain = components.netloc.split(":")[0].lower() (platform_id, dataset_version) = self.parse_figshare_url_path(components.path) assert platform_id.isdigit(), f"expected numeric: {platform_id}" - assert dataset_version and dataset_version.isdigit( + assert ( + dataset_version and dataset_version.isdigit() ), f"expected numeric: {dataset_version}" # 1b. if we didn't get a version number from URL, fetch it from API @@ -405,59 +441,60 @@ class FigshareHelper(FilesetPlatformHelper): # 2. API fetch resp = self.session.get( - f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}") + f"https://api.figshare.com/v2/articles/{platform_id}/versions/{dataset_version}" + ) resp.raise_for_status() obj = resp.json() # figshare_type = obj['defined_type_name'] - if not obj['is_public']: - raise PlatformRestrictedError(f'record not public: {platform_id} {dataset_version}') - if obj['is_embargoed']: + if not obj["is_public"]: + raise PlatformRestrictedError(f"record not public: {platform_id} {dataset_version}") + if obj["is_embargoed"]: raise PlatformRestrictedError( f'record is embargoed: {obj.get("embargo_title")} ({platform_id} {dataset_version})' ) manifest = [] - for row in obj['files']: + for row in obj["files"]: manifest.append( FilesetManifestFile( - path=row['name'], - size=row['size'], - md5=row['computed_md5'], + path=row["name"], + size=row["size"], + md5=row["computed_md5"], # NOTE: don't get: sha1, sha256, mimetype - platform_url=row['download_url'], - #extra=dict(), - )) - assert not row.get('is_link_only') + platform_url=row["download_url"], + # extra=dict(), + ) + ) + assert not row.get("is_link_only") authors = [] - for author in obj['authors']: - authors.append(author['full_name']) + for author in obj["authors"]: + authors.append(author["full_name"]) archiveorg_item_name = f"{platform_domain}-{platform_id}-v{dataset_version}" archiveorg_item_meta = dict( # TODO: collection=platform_domain, collection="datasets", creator=authors, - doi=obj['doi'], - title=obj['title'], - date=obj['published_date'], - source=obj['url_public_html'], - description=obj['description'], - license=obj['license']['url'], - version=obj['version'], + doi=obj["doi"], + title=obj["title"], + date=obj["published_date"], + source=obj["url_public_html"], + description=obj["description"], + license=obj["license"]["url"], + version=obj["version"], ) return FilesetPlatformItem( platform_name=self.platform_name, - platform_status='success', + platform_status="success", manifest=manifest, platform_domain=platform_domain, platform_id=platform_id, archiveorg_item_name=archiveorg_item_name, archiveorg_item_meta=archiveorg_item_meta, - web_bundle_url= - f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}", + web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}", # TODO: web_base_url= (for GWB 
downloading, in lieu of platform_url on individual files) extra=dict(version=dataset_version), ) @@ -466,13 +503,19 @@ class FigshareHelper(FilesetPlatformHelper): def test_parse_figshare_url_path() -> None: valid = { - "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": - ("8987858", "1"), - "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": - ("8987858", None), + "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1": ( + "8987858", + "1", + ), + "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858": ( + "8987858", + None, + ), "/articles/CIBERSORT_p-value_0_05/8217188/1": ("8217188", "1"), - "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": - ("12127176", "4"), + "/articles/dataset/STable_1_U-Pb_geochronologic_analyses_on_samples_xls/12127176/4": ( + "12127176", + "4", + ), } invalid = [ @@ -493,25 +536,33 @@ def test_parse_figshare_url_path() -> None: class ZenodoHelper(FilesetPlatformHelper): def __init__(self): super().__init__() - self.platform_name = 'zenodo' + self.platform_name = "zenodo" self.session = requests.Session() - def match_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> bool: if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] components = urllib.parse.urlparse(url) - platform_domain = components.netloc.split(':')[0].lower() - if platform_domain == 'zenodo.org' and '/record/' in components.path: + platform_domain = components.netloc.split(":")[0].lower() + if platform_domain == "zenodo.org" and "/record/" in components.path: return True return False - def process_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ @@ -519,7 +570,7 @@ class ZenodoHelper(FilesetPlatformHelper): if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] # TODO: also look in base_url and resource-non-terminal for ident? to # check for work-level redirects @@ -527,118 +578,118 @@ class ZenodoHelper(FilesetPlatformHelper): # 1. extract identifier from URL # eg: https://zenodo.org/record/5230255 components = urllib.parse.urlparse(url) - platform_domain = components.netloc.split(':')[0].lower() - if len(components.path.split('/')) < 2: + platform_domain = components.netloc.split(":")[0].lower() + if len(components.path.split("/")) < 2: raise PlatformScopeError("Expected a complete, versioned figshare URL") - platform_id = components.path.split('/')[2] + platform_id = components.path.split("/")[2] assert platform_id.isdigit(), f"expected numeric: {platform_id}" - if 'zenodo.org' not in platform_domain: + if "zenodo.org" not in platform_domain: raise PlatformScopeError(f"unexpected zenodo.org domain: {platform_domain}") # 2. 
API fetch resp = self.session.get(f"https://zenodo.org/api/records/{platform_id}") if resp.status_code == 410: - raise PlatformRestrictedError('record deleted') + raise PlatformRestrictedError("record deleted") resp.raise_for_status() obj = resp.json() - assert obj['id'] == int(platform_id) - work_id = obj['conceptrecid'] - if work_id == obj['id']: + assert obj["id"] == int(platform_id) + work_id = obj["conceptrecid"] + if work_id == obj["id"]: raise PlatformScopeError( - "got a work-level zenodo record, not a versioned record: {work_id}") + "got a work-level zenodo record, not a versioned record: {work_id}" + ) # zenodo_type = obj['metadata']['resource_type']['type'] - if obj['metadata']['access_right'] != 'open': + if obj["metadata"]["access_right"] != "open": raise PlatformRestrictedError( "not publicly available ({obj['metadata']['access_right']}): {platform_domain} {platform_id}" ) manifest = [] - for row in obj['files']: + for row in obj["files"]: mf = FilesetManifestFile( - path=row['key'], - size=row['size'], - platform_url=row['links']['self'], - #extra=dict(), + path=row["key"], + size=row["size"], + platform_url=row["links"]["self"], + # extra=dict(), ) - checksum = row['checksum'] + checksum = row["checksum"] # eg: md5:35ffcab905f8224556dba76648cb7dad - if checksum.startswith('md5:'): + if checksum.startswith("md5:"): mf.md5 = checksum[4:] - elif checksum.startswith('sha1:'): + elif checksum.startswith("sha1:"): mf.sha1 = checksum[45] manifest.append(mf) authors = [] - for author in obj['metadata']['creators']: - authors.append(author['name']) + for author in obj["metadata"]["creators"]: + authors.append(author["name"]) archiveorg_item_name = f"{platform_domain}-{platform_id}" archiveorg_item_meta = dict( # TODO: collection=platform_domain, collection="datasets", creator=authors, - doi=obj['doi'], - title=obj['metadata']['title'], - date=obj['metadata']['publication_date'], - source=obj['links']['html'], - description=obj['metadata']['description'], - license=obj['metadata']['license']['id'], - version=obj['revision'], + doi=obj["doi"], + title=obj["metadata"]["title"], + date=obj["metadata"]["publication_date"], + source=obj["links"]["html"], + description=obj["metadata"]["description"], + license=obj["metadata"]["license"]["id"], + version=obj["revision"], # obj['metadata']['version'] is, eg, git version tag ) return FilesetPlatformItem( platform_name=self.platform_name, - platform_status='success', + platform_status="success", manifest=manifest, platform_domain=platform_domain, platform_id=platform_id, archiveorg_item_name=archiveorg_item_name, archiveorg_item_meta=archiveorg_item_meta, - #web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}", + # web_bundle_url=f"https://ndownloader.figshare.com/articles/{platform_id}/versions/{dataset_version}", # TODO: web_base_url= (for GWB downloading, in lieu of platform_url on individual files) - extra=dict(version=obj['revision']), + extra=dict(version=obj["revision"]), ) class ArchiveOrgHelper(FilesetPlatformHelper): FORMAT_TO_MIMETYPE = { - 'BZIP': 'application/x-bzip', - 'BZIP2': 'application/x-bzip2', - 'ZIP': 'application/zip', - 'GZIP': 'application/gzip', - 'RAR': 'application/vnd.rar', - 'TAR': 'application/x-tar', - '7z': 'application/x-7z-compressed', - 'HTML': 'text/html', - 'Text': 'text/plain', - 'PDF': 'application/pdf', - 'CSV': 'text/csv', - 'XML': 'application/xml', - 'JSON': 'application/json', - + "BZIP": "application/x-bzip", + "BZIP2": "application/x-bzip2", + "ZIP": 
"application/zip", + "GZIP": "application/gzip", + "RAR": "application/vnd.rar", + "TAR": "application/x-tar", + "7z": "application/x-7z-compressed", + "HTML": "text/html", + "Text": "text/plain", + "PDF": "application/pdf", + "CSV": "text/csv", + "XML": "application/xml", + "JSON": "application/json", #'application/msword (.doc)', # .doc #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx #'application/vnd.ms-excel', # .xls #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx - 'MP3': 'audio/mpeg', # .mp3 - 'MP4': 'video/mp4', # .mp4 - 'MPEG': 'video/mpeg', # .mpeg - 'JPEG': 'image/jpeg', - 'GIF': 'image/gif', - 'PNG': 'image/png', - 'TIFF': 'image/tiff', - 'Unknown': None, + "MP3": "audio/mpeg", # .mp3 + "MP4": "video/mp4", # .mp4 + "MPEG": "video/mpeg", # .mpeg + "JPEG": "image/jpeg", + "GIF": "image/gif", + "PNG": "image/png", + "TIFF": "image/tiff", + "Unknown": None, } def __init__(self): super().__init__() - self.platform_name = 'archiveorg' + self.platform_name = "archiveorg" self.session = internetarchive.get_session() @staticmethod @@ -646,69 +697,79 @@ class ArchiveOrgHelper(FilesetPlatformHelper): """ Filters IA API files """ - if f.source != 'original': + if f.source != "original": return False for suffix in [ - '_meta.sqlite', - '_archive.torrent', - '_itemimage.jpg', - '_meta.xml', - '_thumb.png', - '_files.xml', + "_meta.sqlite", + "_archive.torrent", + "_itemimage.jpg", + "_meta.xml", + "_thumb.png", + "_files.xml", ]: if f.name == item_name + suffix or f.name == item_name.lower() + suffix: return False - if f.name.startswith('_'): + if f.name.startswith("_"): return False - if item_name.startswith('academictorrents_'): + if item_name.startswith("academictorrents_"): for suffix in [ - '_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib' + "_academictorrents.torrent", + "_academictorrents_torrent.txt", + ".bib", ]: if f.name == item_name + suffix: return False return True - def match_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> bool: + def match_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> bool: if resource and resource.terminal_url: url = resource.terminal_url else: - url = request['base_url'] + url = request["base_url"] patterns = [ - '://archive.org/details/', - '://archive.org/download/', + "://archive.org/details/", + "://archive.org/download/", ] for p in patterns: if p in url: return True return False - def process_request(self, request: dict, resource: Optional[ResourceResult], - html_biblio: Optional[BiblioMetadata]) -> FilesetPlatformItem: + def process_request( + self, + request: dict, + resource: Optional[ResourceResult], + html_biblio: Optional[BiblioMetadata], + ) -> FilesetPlatformItem: """ Fetch platform-specific metadata for this request (eg, via API calls) """ - base_url_split = request['base_url'].split('/') - #print(base_url_split, file=sys.stderr) + base_url_split = request["base_url"].split("/") + # print(base_url_split, file=sys.stderr) assert len(base_url_split) in [5, 6] - assert base_url_split[0] in ['http:', 'https:'] - assert base_url_split[2] == 'archive.org' - assert base_url_split[3] in ['details', 'download'] + assert base_url_split[0] in ["http:", "https:"] + assert base_url_split[2] == "archive.org" + assert base_url_split[3] in ["details", "download"] item_name = base_url_split[4] if len(base_url_split) == 6 and base_url_split[5]: 
raise PlatformScopeError( "got an archive.org file path, not download/details page; individual files not handled yet" ) - #print(f" archiveorg processing item={item_name}", file=sys.stderr) + # print(f" archiveorg processing item={item_name}", file=sys.stderr) item = self.session.get_item(item_name) item_name = item.identifier - item_collection = item.metadata['collection'] + item_collection = item.metadata["collection"] if type(item_collection) == list: item_collection = item_collection[0] - assert item.metadata['mediatype'] not in ['collection', 'web'] + assert item.metadata["mediatype"] not in ["collection", "web"] item_files = item.get_files(on_the_fly=False) item_files = [f for f in item_files if self.want_item_file(f, item_name)] manifest = [] @@ -727,9 +788,9 @@ class ArchiveOrgHelper(FilesetPlatformHelper): return FilesetPlatformItem( platform_name=self.platform_name, - platform_status='success', + platform_status="success", manifest=manifest, - platform_domain='archive.org', + platform_domain="archive.org", platform_id=item_name, archiveorg_item_name=item_name, archiveorg_meta=dict(collection=item_collection), diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py index 9d3bae3..6dc77f9 100644 --- a/python/sandcrawler/fileset_strategies.py +++ b/python/sandcrawler/fileset_strategies.py @@ -5,15 +5,19 @@ from typing import Optional import internetarchive -from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetPlatformItem, - IngestStrategy, PlatformScopeError) +from sandcrawler.fileset_types import ( + ArchiveStrategyResult, + FilesetPlatformItem, + IngestStrategy, + PlatformScopeError, +) from sandcrawler.ia import SavePageNowClient, WaybackClient, fix_transfer_encoding from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path -class FilesetIngestStrategy(): +class FilesetIngestStrategy: def __init__(self): - #self.ingest_strategy = 'unknown' + # self.ingest_strategy = 'unknown' self.success_status = "success" def check_existing(self, item: FilesetPlatformItem) -> Optional[ArchiveStrategyResult]: @@ -29,8 +33,8 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): self.ingest_strategy = IngestStrategy.ArchiveorgFileset # TODO: enable cleanup when confident (eg, safe path parsing) - self.skip_cleanup_local_files = kwargs.get('skip_cleanup_local_files', True) - self.working_dir = os.environ.get('SANDCRAWLER_WORKING_DIR', '/tmp/sandcrawler/') + self.skip_cleanup_local_files = kwargs.get("skip_cleanup_local_files", True) + self.working_dir = os.environ.get("SANDCRAWLER_WORKING_DIR", "/tmp/sandcrawler/") try: os.mkdir(self.working_dir) except FileExistsError: @@ -53,23 +57,29 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): found = False for existing in item_files: if existing.name == wanted.path: - if ((existing.sha1 and existing.sha1 == wanted.sha1) or - (existing.md5 and existing.md5 == wanted.md5) - ) and existing.name == wanted.path and existing.size == wanted.size: + if ( + ( + (existing.sha1 and existing.sha1 == wanted.sha1) + or (existing.md5 and existing.md5 == wanted.md5) + ) + and existing.name == wanted.path + and existing.size == wanted.size + ): found = True - wanted.status = 'exists' + wanted.status = "exists" break else: - wanted.status = 'mismatch-existing' + wanted.status = "mismatch-existing" break if not found: print( f" item exists ({item.archiveorg_item_name}) but didn't find at least one file: {wanted.path}", - file=sys.stderr) + file=sys.stderr, + ) return None 
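
For reference, the existence check above reduces to matching each wanted manifest entry against the files already present in the archive.org item, by name plus size and one of the hashes. A minimal standalone sketch of that comparison, using only internetarchive calls that already appear in this diff; the item name and file values below are hypothetical:

import internetarchive

def item_has_file(item_name: str, path: str, size: int, md5: str) -> bool:
    # fetch the item and its file listing (same calls used by ArchiveOrgHelper above)
    session = internetarchive.get_session()
    item = session.get_item(item_name)
    for existing in item.get_files(on_the_fly=False):
        # match on path plus size and checksum, mirroring check_existing()
        if existing.name == path and existing.size == size and existing.md5 == md5:
            return True
    return False

# hypothetical usage:
# item_has_file("zenodo.org-5230255", "data.csv", 1234, "35ffcab905f8224556dba76648cb7dad")
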
return ArchiveStrategyResult( ingest_strategy=self.ingest_strategy, - status='success-existing', + status="success-existing", manifest=item.manifest, ) @@ -81,12 +91,12 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): if existing: return existing - if item.platform_name == 'archiveorg': + if item.platform_name == "archiveorg": raise PlatformScopeError("should't download archive.org into itself") local_dir = self.working_dir + item.archiveorg_item_name - assert local_dir.startswith('/') - assert local_dir.count('/') > 2 + assert local_dir.startswith("/") + assert local_dir.count("/") > 2 try: os.mkdir(local_dir) except FileExistsError: @@ -96,71 +106,80 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): assert item.manifest for m in item.manifest: if m.path != sanitize_fs_path(m.path): - m.status = 'unsafe-path' + m.status = "unsafe-path" continue - local_path = local_dir + '/' + m.path + local_path = local_dir + "/" + m.path assert m.platform_url if not os.path.exists(local_path): print(f" downloading {m.path}", file=sys.stderr) - with self.ia_session.get(m.platform_url, stream=True, - allow_redirects=True) as r: + with self.ia_session.get( + m.platform_url, stream=True, allow_redirects=True + ) as r: r.raise_for_status() - with open(local_path + '.partial', 'wb') as f: + with open(local_path + ".partial", "wb") as f: for chunk in r.iter_content(chunk_size=256 * 1024): f.write(chunk) - os.rename(local_path + '.partial', local_path) - m.status = 'downloaded-local' + os.rename(local_path + ".partial", local_path) + m.status = "downloaded-local" else: - m.status = 'exists-local' + m.status = "exists-local" print(f" verifying {m.path}", file=sys.stderr) file_meta = gen_file_metadata_path(local_path, allow_empty=True) - assert file_meta[ - 'size_bytes'] == m.size, f"expected: {m.size} found: {file_meta['size_bytes']}" + assert ( + file_meta["size_bytes"] == m.size + ), f"expected: {m.size} found: {file_meta['size_bytes']}" if m.sha1: - assert file_meta['sha1hex'] == m.sha1 + assert file_meta["sha1hex"] == m.sha1 else: - m.sha1 = file_meta['sha1hex'] + m.sha1 = file_meta["sha1hex"] if m.sha256: - assert file_meta['sha256hex'] == m.sha256 + assert file_meta["sha256hex"] == m.sha256 else: - m.sha256 = file_meta['sha256hex'] + m.sha256 = file_meta["sha256hex"] if m.md5: - assert file_meta['md5hex'] == m.md5 + assert file_meta["md5hex"] == m.md5 else: - m.md5 = file_meta['md5hex'] + m.md5 = file_meta["md5hex"] if m.mimetype: # 'magic' isn't good and parsing more detailed text file formats like text/csv - if file_meta['mimetype'] != m.mimetype and file_meta['mimetype'] != 'text/plain': + if ( + file_meta["mimetype"] != m.mimetype + and file_meta["mimetype"] != "text/plain" + ): # these 'tab-separated-values' from dataverse are just noise, don't log them - if m.mimetype != 'text/tab-separated-values': + if m.mimetype != "text/tab-separated-values": print( f" WARN: mimetype mismatch: expected {m.mimetype}, found {file_meta['mimetype']}", - file=sys.stderr) - m.mimetype = file_meta['mimetype'] + file=sys.stderr, + ) + m.mimetype = file_meta["mimetype"] else: - m.mimetype = file_meta['mimetype'] - m.status = 'verified-local' + m.mimetype = file_meta["mimetype"] + m.status = "verified-local" # 2. 
upload all files, with metadata - assert item.archiveorg_item_meta and item.archiveorg_item_meta['collection'] + assert item.archiveorg_item_meta and item.archiveorg_item_meta["collection"] item_files = [] for m in item.manifest: - local_path = local_dir + '/' + m.path - item_files.append({ - 'name': local_path, - 'remote_name': m.path, - }) + local_path = local_dir + "/" + m.path + item_files.append( + { + "name": local_path, + "remote_name": m.path, + } + ) print( f" uploading all files to {item.archiveorg_item_name} under {item.archiveorg_item_meta.get('collection')}...", - file=sys.stderr) + file=sys.stderr, + ) internetarchive.upload( item.archiveorg_item_name, files=item_files, @@ -171,7 +190,7 @@ class ArchiveorgFilesetStrategy(FilesetIngestStrategy): ) for m in item.manifest: - m.status = 'success' + m.status = "success" # 4. delete local directory if not self.skip_cleanup_local_files: @@ -191,6 +210,7 @@ class ArchiveorgFileStrategy(ArchiveorgFilesetStrategy): ArchiveorgFilesetStrategy currently works fine with individual files. Just need to over-ride the ingest_strategy name. """ + def __init__(self): super().__init__() self.ingest_strategy = IngestStrategy.ArchiveorgFileset @@ -204,7 +224,8 @@ class WebFilesetStrategy(FilesetIngestStrategy): self.wayback_client = WaybackClient() self.try_spn2 = True self.spn_client = SavePageNowClient( - spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0) + ) self.max_spn_manifest = 20 def process(self, item: FilesetPlatformItem) -> ArchiveStrategyResult: @@ -222,25 +243,31 @@ class WebFilesetStrategy(FilesetIngestStrategy): fetch_url = m.platform_url if not fetch_url: raise NotImplementedError( - "require 'platform_url' for each file when doing Web fetching") + "require 'platform_url' for each file when doing Web fetching" + ) via = "wayback" resource = self.wayback_client.lookup_resource(fetch_url, m.mimetype) - if self.try_spn2 and (resource is None or - (resource and resource.status == 'no-capture')): + if self.try_spn2 and ( + resource is None or (resource and resource.status == "no-capture") + ): if len(item.manifest) > self.max_spn_manifest: - m.status = 'too-much-spn' + m.status = "too-much-spn" continue via = "spn2" - resource = self.spn_client.crawl_resource(fetch_url, - self.wayback_client, - force_simple_get=True) - - print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), - (resource and resource.terminal_url) - or fetch_url), - file=sys.stderr) + resource = self.spn_client.crawl_resource( + fetch_url, self.wayback_client, force_simple_get=True + ) + + print( + "[FETCH {:>6}] {} {}".format( + via, + (resource and resource.status), + (resource and resource.terminal_url) or fetch_url, + ), + file=sys.stderr, + ) m.terminal_url = resource.terminal_url m.terminal_dt = resource.terminal_dt @@ -248,7 +275,7 @@ class WebFilesetStrategy(FilesetIngestStrategy): if self.ingest_strategy == "web-file": file_resource = resource - if resource.status != 'success': + if resource.status != "success": continue else: assert resource.terminal_status_code == 200 @@ -259,24 +286,26 @@ class WebFilesetStrategy(FilesetIngestStrategy): if self.ingest_strategy == "web-file": file_file_meta = file_meta - if file_meta['size_bytes'] != m.size or (m.md5 and m.md5 != file_meta['md5hex'] - ) or (m.sha1 - and m.sha1 != file_meta['sha1hex']): - m.status = 'mismatch' + if ( + file_meta["size_bytes"] != m.size + or (m.md5 and m.md5 != file_meta["md5hex"]) + or (m.sha1 and m.sha1 != 
file_meta["sha1hex"]) + ): + m.status = "mismatch" continue - m.md5 = m.md5 or file_meta['md5hex'] - m.sha1 = m.sha1 or file_meta['md5hex'] - m.sha256 = m.sha256 or file_meta['sha256hex'] - m.mimetype = m.mimetype or file_meta['mimetype'] + m.md5 = m.md5 or file_meta["md5hex"] + m.sha1 = m.sha1 or file_meta["md5hex"] + m.sha256 = m.sha256 or file_meta["sha256hex"] + m.mimetype = m.mimetype or file_meta["mimetype"] overall_status = self.success_status for m in item.manifest: - if m.status != 'success': - overall_status = m.status or 'not-processed' + if m.status != "success": + overall_status = m.status or "not-processed" break if not item.manifest: - overall_status = 'empty-manifest' + overall_status = "empty-manifest" result = ArchiveStrategyResult( ingest_strategy=self.ingest_strategy, diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py index f543ede..3398833 100644 --- a/python/sandcrawler/fileset_types.py +++ b/python/sandcrawler/fileset_types.py @@ -62,6 +62,7 @@ class PlatformScopeError(Exception): - a 'latest version' record, when the platform has version-specific records - a single file within a dataset for a platform which has file-level identifiers """ + pass @@ -69,4 +70,5 @@ class PlatformRestrictedError(Exception): """ When datasets are not publicly available on a platform (yet) """ + pass diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 67aca17..26918f6 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -12,11 +12,11 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker class GrobidClient(object): def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs): self.host_url = host_url - self.consolidate_mode = int(kwargs.get('consolidate_mode', 0)) + self.consolidate_mode = int(kwargs.get("consolidate_mode", 0)) - def process_fulltext(self, - blob: bytes, - consolidate_mode: Optional[int] = None) -> Dict[str, Any]: + def process_fulltext( + self, blob: bytes, consolidate_mode: Optional[int] = None + ) -> Dict[str, Any]: """ Returns dict with keys: - status_code @@ -36,72 +36,75 @@ class GrobidClient(object): grobid_response = requests.post( self.host_url + "/api/processFulltextDocument", files={ - 'input': blob, - 'consolidateHeader': consolidate_mode, - 'consolidateCitations': 0, # too expensive for now - 'includeRawCitations': 1, + "input": blob, + "consolidateHeader": consolidate_mode, + "consolidateCitations": 0, # too expensive for now + "includeRawCitations": 1, }, timeout=180.0, ) except requests.Timeout: return { - 'status': 'error-timeout', - 'status_code': -4, # heritrix3 "HTTP timeout" code - 'error_msg': 'GROBID request (HTTP POST) timeout', + "status": "error-timeout", + "status_code": -4, # heritrix3 "HTTP timeout" code + "error_msg": "GROBID request (HTTP POST) timeout", } info: Dict[str, Any] = dict(status_code=grobid_response.status_code) if grobid_response.status_code == 200: - info['status'] = 'success' - info['tei_xml'] = grobid_response.text - if len(info['tei_xml']) > 12000000: + info["status"] = "success" + info["tei_xml"] = grobid_response.text + if len(info["tei_xml"]) > 12000000: # XML is larger than Kafka message size, and much larger than # an article in general; bail out - info['status'] = 'error' - info['error_msg'] = "response XML too large: {} bytes".format( - len(info['tei_xml'])) - info.pop('tei_xml') + info["status"] = "error" + info["error_msg"] = "response XML too large: {} bytes".format( + len(info["tei_xml"]) + ) + 
info.pop("tei_xml") else: # response.text is .content decoded as utf-8 - info['status'] = 'error' - info['error_msg'] = grobid_response.text[:10000] + info["status"] = "error" + info["error_msg"] = grobid_response.text[:10000] return info def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]: - if result['status'] != 'success': + if result["status"] != "success": return None - tei_json = teixml2json(result['tei_xml'], encumbered=False) + tei_json = teixml2json(result["tei_xml"], encumbered=False) meta = dict() biblio = dict() for k in ( - 'title', - 'authors', - 'journal', - 'date', - 'doi', + "title", + "authors", + "journal", + "date", + "doi", ): if tei_json.get(k): biblio[k] = tei_json[k] - meta['biblio'] = biblio - for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'): + meta["biblio"] = biblio + for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"): if tei_json.get(k): meta[k] = tei_json[k] return meta class GrobidWorker(SandcrawlerFetchWorker): - def __init__(self, - grobid_client: GrobidClient, - wayback_client: Optional[WaybackClient] = None, - sink: Optional[SandcrawlerWorker] = None, - **kwargs): + def __init__( + self, + grobid_client: GrobidClient, + wayback_client: Optional[WaybackClient] = None, + sink: Optional[SandcrawlerWorker] = None, + **kwargs + ): super().__init__(wayback_client=wayback_client) self.grobid_client = grobid_client self.sink = sink self.consolidate_mode = 0 def timeout_response(self, task: Any) -> Any: - default_key = task['sha1hex'] + default_key = task["sha1hex"] return dict( status="error-timeout", error_msg="internal GROBID worker timeout", @@ -111,16 +114,17 @@ class GrobidWorker(SandcrawlerFetchWorker): def process(self, record: Any, key: Optional[str] = None) -> Any: fetch_result = self.fetch_blob(record) - if fetch_result['status'] != 'success': + if fetch_result["status"] != "success": return fetch_result - blob: bytes = fetch_result['blob'] + blob: bytes = fetch_result["blob"] assert blob and isinstance(blob, bytes) - result = self.grobid_client.process_fulltext(blob, - consolidate_mode=self.consolidate_mode) - result['file_meta'] = gen_file_metadata(blob) - result['source'] = record - result['key'] = result['file_meta']['sha1hex'] + result = self.grobid_client.process_fulltext( + blob, consolidate_mode=self.consolidate_mode + ) + result["file_meta"] = gen_file_metadata(blob) + result["source"] = record + result["key"] = result["file_meta"]["sha1hex"] return result @@ -129,10 +133,10 @@ class GrobidBlobWorker(SandcrawlerWorker): This is sort of like GrobidWorker, except it receives blobs directly, instead of fetching blobs from some remote store. 
""" - def __init__(self, - grobid_client: GrobidClient, - sink: Optional[SandcrawlerWorker] = None, - **kwargs): + + def __init__( + self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs + ): super().__init__() self.grobid_client = grobid_client self.sink = sink @@ -141,8 +145,9 @@ class GrobidBlobWorker(SandcrawlerWorker): def process(self, blob: Any, key: Optional[str] = None) -> Any: if not blob: return None - result = self.grobid_client.process_fulltext(blob, - consolidate_mode=self.consolidate_mode) - result['file_meta'] = gen_file_metadata(blob) - result['key'] = result['file_meta']['sha1hex'] + result = self.grobid_client.process_fulltext( + blob, consolidate_mode=self.consolidate_mode + ) + result["file_meta"] = gen_file_metadata(blob) + result["key"] = result["file_meta"]["sha1hex"] return result diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 4d36573..5fba963 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -7,7 +7,8 @@ from typing import Dict from bs4 import BeautifulSoup RESEARCHSQUARE_REGEX = re.compile( - r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"') + r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"' +) IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"') OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";') SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';") @@ -21,9 +22,9 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: On error, or if fails to extract a URL, returns an empty dict. """ - host_prefix = '/'.join(html_url.split('/')[:3]) + host_prefix = "/".join(html_url.split("/")[:3]) try: - soup = BeautifulSoup(html_body, 'html.parser') + soup = BeautifulSoup(html_body, "html.parser") except TypeError as te: print(f"{te} (url={html_url})", file=sys.stderr) return dict() @@ -34,80 +35,86 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: ### General Tricks ### # highwire-style meta tag - meta = soup.find('meta', attrs={"name": "citation_pdf_url"}) + meta = soup.find("meta", attrs={"name": "citation_pdf_url"}) if not meta: - meta = soup.find('meta', attrs={"name": "bepress_citation_pdf_url"}) + meta = soup.find("meta", attrs={"name": "bepress_citation_pdf_url"}) if not meta: - meta = soup.find('meta', attrs={"name": "wkhealth_pdf_url"}) + meta = soup.find("meta", attrs={"name": "wkhealth_pdf_url"}) if not meta: # researchgate does this; maybe others also? 
- meta = soup.find('meta', attrs={"property": "citation_pdf_url"}) + meta = soup.find("meta", attrs={"property": "citation_pdf_url"}) if not meta: - meta = soup.find('meta', attrs={"name": "eprints.document_url"}) + meta = soup.find("meta", attrs={"name": "eprints.document_url"}) # if tag is only partially populated - if meta and not meta.get('content'): + if meta and not meta.get("content"): meta = None # wiley has a weird almost-blank page we don't want to loop on if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url: - url = meta['content'].strip() - if '://doi.org/' in url: + url = meta["content"].strip() + if "://doi.org/" in url: print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr) - elif url.startswith('/'): + elif url.startswith("/"): if host_prefix + url == html_url: print("\tavoiding citation_pdf_url link-loop", file=sys.stderr) else: - return dict(pdf_url=host_prefix + url, technique='citation_pdf_url') - elif url.startswith('http'): + return dict(pdf_url=host_prefix + url, technique="citation_pdf_url") + elif url.startswith("http"): if url == html_url: print("\tavoiding citation_pdf_url link-loop", file=sys.stderr) else: - return dict(pdf_url=url, technique='citation_pdf_url') + return dict(pdf_url=url, technique="citation_pdf_url") else: print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr) - meta = soup.find('meta', attrs={"name": "generator"}) + meta = soup.find("meta", attrs={"name": "generator"}) meta_generator = None - if meta and meta.get('content'): - meta_generator = meta['content'].strip() + if meta and meta.get("content"): + meta_generator = meta["content"].strip() ### Publisher/Platform Specific ### # research square (researchsquare.com) - if 'researchsquare.com/article/' in html_url: + if "researchsquare.com/article/" in html_url: # JSON in body with a field like: # "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf" - m = RESEARCHSQUARE_REGEX.search(html_body.decode('utf-8')) + m = RESEARCHSQUARE_REGEX.search(html_body.decode("utf-8")) if m: url = m.group(1) assert len(url) < 4096 - return dict(release_stage="manuscript", pdf_url=url, technique='publisher') + return dict(release_stage="manuscript", pdf_url=url, technique="publisher") # elseiver linking hub # https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975 - if '://linkinghub.elsevier.com/retrieve/pii/' in html_url: + if "://linkinghub.elsevier.com/retrieve/pii/" in html_url: # <input type="hidden" name="redirectURL" value="http%3A%2F%2Fcysticfibrosisjournal.com%2Fretrieve%2Fpii%2FS1569199319308975" id="redirectURL"/> redirect = soup.find("input", attrs={"name": "redirectURL"}) if redirect: - url = redirect['value'].strip() - if 'http' in url: + url = redirect["value"].strip() + if "http" in url: url = urllib.parse.unquote(url) # drop any the query parameter - url = url.split('?via')[0] + url = url.split("?via")[0] return dict(next_url=url, technique="elsevier-linkinghub") # sciencedirect PDF URL extract # https://www.sciencedirect.com/science/article/pii/S0169204621000670 - if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"): + if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"): json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"}) url = None if json_tag: try: json_text = json_tag.string json_meta = json.loads(json_text) - pdf_meta = json_meta['article']['pdfDownload']['urlMetadata'] + 
pdf_meta = json_meta["article"]["pdfDownload"]["urlMetadata"] # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf - url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams'][ - 'md5'] + "&pid=" + pdf_meta['queryParams']['pid'] + url = ( + html_url + + pdf_meta["pdfExtension"] + + "?md5=" + + pdf_meta["queryParams"]["md5"] + + "&pid=" + + pdf_meta["queryParams"]["pid"] + ) except (KeyError, TypeError, json.JSONDecodeError): pass if url: @@ -115,9 +122,9 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # sciencedirect PDF bounce page # https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf - if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"): + if "://www.sciencedirect.com/" in html_url and html_url.endswith(".pdf"): # window.location = 'https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=[...]&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=[...]&hash=[...]&host=[...]&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=[...]&type=client'; - m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode('utf-8')) + m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(html_body.decode("utf-8")) if m: url = m.group(1) assert len(url) < 4000 @@ -125,34 +132,34 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # ieeexplore.ieee.org # https://ieeexplore.ieee.org/document/8730316 - if '://ieeexplore.ieee.org/document/' in html_url: + if "://ieeexplore.ieee.org/document/" in html_url: # JSON in body with a field like: # "pdfPath":"/iel7/6287639/8600701/08730316.pdf", - m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8')) + m = IEEEXPLORE_REGEX.search(html_body.decode("utf-8")) if m: url = m.group(1) assert len(url) < 4096 - return dict(release_stage="published", - pdf_url=host_prefix + url, - technique="ieeexplore") + return dict( + release_stage="published", pdf_url=host_prefix + url, technique="ieeexplore" + ) # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313 - if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url: + if "://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber" in html_url: # HTML iframe like: # <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&arnumber=8730313&isnumber=8600701&ref=" frameborder="0"></iframe> iframe = soup.find("iframe") - if iframe and '.pdf' in iframe['src']: - return dict(pdf_url=iframe['src'], technique="iframe") + if iframe and ".pdf" in iframe["src"]: + return dict(pdf_url=iframe["src"], technique="iframe") # https://insights.ovid.com/crossref?an=00042307-202001000-00013 # Ovid is some kind of landing page bounce portal tracking run-around. 
# Can extract actual journal URL from javascript blob in the HTML - if '://insights.ovid.com/crossref' in html_url: + if "://insights.ovid.com/crossref" in html_url: # var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"; - m = OVID_JOURNAL_URL_REGEX.search(html_body.decode('utf-8')) + m = OVID_JOURNAL_URL_REGEX.search(html_body.decode("utf-8")) if m: url = m.group(1) assert len(url) < 4096 - return dict(next_url=url, technique='ovid') + return dict(next_url=url, technique="ovid") # osf.io # https://osf.io/8phvx/ @@ -160,41 +167,44 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # wow, they ship total javascript crud! going to just guess download URL # based on URL for now. Maybe content type header would help? OSF_DOMAINS = [ - '://osf.io/', - '://biohackrxiv.org/', - '://psyarxiv.com/', - '://arabixiv.org/', - '://engrxiv.org/', - '://edarxiv.org//', - '://ecsarxiv.org/', - '://ecoevorxiv.org/', - '://frenxiv.org/', - '://indiarxiv.org/', - '://mindrxiv.org/', - '://mediarxiv.org/', - '://paleorxiv.org/', - '://thesiscommons.org/', + "://osf.io/", + "://biohackrxiv.org/", + "://psyarxiv.com/", + "://arabixiv.org/", + "://engrxiv.org/", + "://edarxiv.org//", + "://ecsarxiv.org/", + "://ecoevorxiv.org/", + "://frenxiv.org/", + "://indiarxiv.org/", + "://mindrxiv.org/", + "://mediarxiv.org/", + "://paleorxiv.org/", + "://thesiscommons.org/", ] for domain in OSF_DOMAINS: - if domain in html_url and (len(html_url.split('/')) in [4, 5] or '/preprints/' - in html_url) and '/download' not in html_url: + if ( + domain in html_url + and (len(html_url.split("/")) in [4, 5] or "/preprints/" in html_url) + and "/download" not in html_url + ): if not html_url.endswith("/"): next_url = html_url + "/download" else: next_url = html_url + "download" - return dict(next_url=next_url, technique='osf-by-url') + return dict(next_url=next_url, technique="osf-by-url") # wiley # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787 if "://onlinelibrary.wiley.com/doi/pdf/" in html_url: if b"/doi/pdfdirect/" in html_body: - next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/') - return dict(next_url=next_url, technique='wiley-pdfdirect') + next_url = html_url.replace("/doi/pdf/", "/doi/pdfdirect/") + return dict(next_url=next_url, technique="wiley-pdfdirect") # arxiv abstract pages if "://arxiv.org/abs/" in html_url: url = html_url.replace("/abs/", "/pdf/") - return dict(pdf_url=url, technique='arxiv-url') + return dict(pdf_url=url, technique="arxiv-url") # american archivist (OA) # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630 @@ -202,28 +212,28 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # use a more aggressive direct guess to avoid rate-limiting... if "/doi/10." 
in html_url: url = html_url.replace("/doi/10.", "/doi/pdf/10.") - return dict(pdf_url=url, technique='archivist-url') + return dict(pdf_url=url, technique="archivist-url") # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank"> - hrefs = soup.find_all('a', attrs={"target": "_blank"}) + hrefs = soup.find_all("a", attrs={"target": "_blank"}) for href in hrefs: - url = href['href'].strip() + url = href["href"].strip() if "/doi/pdf/" in url: - if url.startswith('http'): - return dict(pdf_url=url, technique='publisher-href') - elif url.startswith('/'): - return dict(pdf_url=host_prefix + url, technique='publisher-href') + if url.startswith("http"): + return dict(pdf_url=url, technique="publisher-href") + elif url.startswith("/"): + return dict(pdf_url=host_prefix + url, technique="publisher-href") # protocols.io # https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6 if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"): url = html_url + ".pdf" - return dict(pdf_url=url, technique='protocolsio-url') + return dict(pdf_url=url, technique="protocolsio-url") # degruyter.com # https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"): - url = html_url.replace('/view/', '/downloadpdf/').replace('.xml', '.pdf') - return dict(pdf_url=url, technique='degruyter-url') + url = html_url.replace("/view/", "/downloadpdf/").replace(".xml", ".pdf") + return dict(pdf_url=url, technique="degruyter-url") # journals.lww.com (Wolters Kluwer) # https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx @@ -231,31 +241,32 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # we never get the content. if "://journals.lww.com/" in html_url and False: # data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw==" - for line in html_body.split(b'\n'): + for line in html_body.split(b"\n"): if b"data-pdf-url=" in line: - line = line.decode('utf-8') - url = line.strip().replace('data-pdf-url=', '').replace('"', '') - if url.startswith('http') and 'pdfs.journals.lww.com' in url: - return dict(pdf_url=url, technique='journals.lww.com-jsvar') + line = line.decode("utf-8") + url = line.strip().replace("data-pdf-url=", "").replace('"', "") + if url.startswith("http") and "pdfs.journals.lww.com" in url: + return dict(pdf_url=url, technique="journals.lww.com-jsvar") # www.ahajournals.org # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977 - if "://www.ahajournals.org/doi/" in html_url and '/doi/pdf/' not in html_url: + if "://www.ahajournals.org/doi/" in html_url and "/doi/pdf/" not in html_url: # <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a> - if b'/doi/pdf/10.' in html_body: - url = html_url.replace('/doi/10.', '/doi/pdf/10.') + if b"/doi/pdf/10." 
in html_body: + url = html_url.replace("/doi/10.", "/doi/pdf/10.") url = url + "?download=true" - return dict(pdf_url=url, technique='ahajournals-url') + return dict(pdf_url=url, technique="ahajournals-url") # ehp.niehs.nih.gov # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709 # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51 if "://ehp.niehs.nih.gov/doi/" in html_url: # <a href="/doi/pdf/10.1289/EHP4709" target="_blank"> - if b'/doi/pdf/10.' in html_body: - url = html_url.replace('/doi/full/10.', - '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.') - return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url') + if b"/doi/pdf/10." in html_body: + url = html_url.replace("/doi/full/10.", "/doi/pdf/10.").replace( + "/doi/10.", "/doi/pdf/10." + ) + return dict(pdf_url=url, technique="ehp.niehs.nigh.gov-url") # cogentoa.com # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873 @@ -263,75 +274,75 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: # blech, it's a SPA! All JS # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf url = html_url + ".pdf" - return dict(pdf_url=url, technique='cogentoa-url') + return dict(pdf_url=url, technique="cogentoa-url") # chemrxiv.org (likely to be other figshare domains also) # https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419 - if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url: + if "://chemrxiv.org/articles/" in html_url or ".figshare.org/articles/" in html_url: # <script id="app-data" type="text/json"> [...] </script> - json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"}) + json_tag = soup.find("script", id="app-data", attrs={"type": "text/json"}) if json_tag and json_tag.string: app_data = json.loads(json_tag.string) # "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf" - url = app_data.get('article', {}).get('exportPdfDownloadUrl') - if url and url.startswith('http'): - return dict(pdf_url=url, technique='figshare-json') + url = app_data.get("article", {}).get("exportPdfDownloadUrl") + if url and url.startswith("http"): + return dict(pdf_url=url, technique="figshare-json") # CNKI COVID-19 landing pages # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ - if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url: + if "://en.gzbd.cnki.net/KCMS/detail/detail.aspx" in html_url: # <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&tablename=GZBJLAST2020&dflag=pdfdown
 "><i></i>PDF Download</a> - href = soup.find('a', attrs={"id": "pdfDown"}) + href = soup.find("a", attrs={"id": "pdfDown"}) if href: - url = href['href'].strip().replace('
', '') - if not url.startswith('http'): + url = href["href"].strip().replace("
", "") + if not url.startswith("http"): url = host_prefix + url - return dict(pdf_url=url, technique='cnki-href') + return dict(pdf_url=url, technique="cnki-href") # RWTH AACHEN repository - if '://publications.rwth-aachen.de/record/' in html_url: - record_id = html_url.split('/')[-1] + if "://publications.rwth-aachen.de/record/" in html_url: + record_id = html_url.split("/")[-1] url = f"{html_url}/files/{record_id}.pdf" - if record_id.isdigit() and url.encode('utf-8') in html_body: - return dict(pdf_url=url, technique='rwth-aachen-url') + if record_id.isdigit() and url.encode("utf-8") in html_body: + return dict(pdf_url=url, technique="rwth-aachen-url") # physchemaspects.ru - if '://physchemaspects.ru/' in html_url and soup: - for href in soup.find_all('a'): + if "://physchemaspects.ru/" in html_url and soup: + for href in soup.find_all("a"): if href.text == "download PDF file": - url = href['href'] - if url.startswith('/'): + url = href["href"] + if url.startswith("/"): url = host_prefix + url - return dict(pdf_url=url, technique='physchemaspects-href') + return dict(pdf_url=url, technique="physchemaspects-href") # OJS 3 (some) if meta_generator and meta_generator.startswith("Open Journal Systems"): - href = soup.find('a', attrs={"class": "obj_galley_link file"}) + href = soup.find("a", attrs={"class": "obj_galley_link file"}) if href and href.text and "pdf" in href.text.lower(): - url = href['href'].strip() - if url.startswith('/'): + url = href["href"].strip() + if url.startswith("/"): url = host_prefix + url - return dict(pdf_url=url, technique='ojs-galley-href') + return dict(pdf_url=url, technique="ojs-galley-href") # ETH zurich e-periodica - if '://www.e-periodica.ch/digbib/view' in html_url: - url = html_url.replace('digbib/view', 'cntmng').split('#')[0] - if url.encode('utf-8') in html_body: - return dict(pdf_url=url, technique='href-eperiodica') + if "://www.e-periodica.ch/digbib/view" in html_url: + url = html_url.replace("digbib/view", "cntmng").split("#")[0] + if url.encode("utf-8") in html_body: + return dict(pdf_url=url, technique="href-eperiodica") # JMIR # https://mhealth.jmir.org/2020/7/e17891/ - if '.jmir.org/' in html_url and "/pdf" not in html_url and html_url.endswith("/"): + if ".jmir.org/" in html_url and "/pdf" not in html_url and html_url.endswith("/"): url = html_url + "pdf" - return dict(pdf_url=url, technique='jmir-url') + return dict(pdf_url=url, technique="jmir-url") ### below here we are doing guesses # generic guess: try current URL plus .pdf, if it exists in the HTML body - if '.pdf' not in html_url: + if ".pdf" not in html_url: url = html_url + ".pdf" - if url.encode('utf-8') in html_body: - return dict(pdf_url=url, technique='guess-url-plus-pdf') + if url.encode("utf-8") in html_body: + return dict(pdf_url=url, technique="guess-url-plus-pdf") return dict() @@ -343,8 +354,10 @@ def test_regex() -> None: asdf""" m = OVID_JOURNAL_URL_REGEX.search(lines) assert m - assert m.group( - 1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689" + assert ( + m.group(1) + == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689" + ) lines = """ window.onload = function () { diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index e2e673f..1ab667c 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -30,7 +30,9 @@ HEAD_META_PATTERNS: Dict[str, List[str]] = { "meta[name='dcterms.title']", "meta[name='dc.title']", ], - "subtitle": 
["meta[name='prism.subtitle']", ], + "subtitle": [ + "meta[name='prism.subtitle']", + ], "doi": [ "meta[name='citation_doi']", "meta[name='DOI']", @@ -40,7 +42,9 @@ HEAD_META_PATTERNS: Dict[str, List[str]] = { "meta[name='dc.identifier.doi']", "meta[name='dc.identifier'][scheme='doi']", ], - "pmid": ["meta[name='citation_pmid']", ], + "pmid": [ + "meta[name='citation_pmid']", + ], "abstract": [ "meta[name='citation_abstract']", "meta[name='bepress_citation_abstract']", @@ -61,7 +65,9 @@ HEAD_META_PATTERNS: Dict[str, List[str]] = { "meta[name='dc.source']", "meta[property='og:site_name']", ], - "container_abbrev": ["meta[name='citation_journal_abbrev']", ], + "container_abbrev": [ + "meta[name='citation_journal_abbrev']", + ], "raw_date": [ "meta[name='citation_publication_date']", "meta[name='bepress_citation_publication_date']", @@ -162,7 +168,9 @@ HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = { "meta[name='dc.contributor']", ], # TODO: citation_author_institution - "raw_references": ["meta[name='citation_reference']", ], + "raw_references": [ + "meta[name='citation_reference']", + ], "raw_identifiers": [ "meta[name='eprints.id_number']", "meta[name='dcterms.identifier']", @@ -646,8 +654,9 @@ class BiblioMetadata(pydantic.BaseModel): json_encoders = {datetime.date: lambda dt: dt.isoformat()} -def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, - patterns: List[dict]) -> Optional[Tuple[str, str]]: +def html_extract_fulltext_url( + doc_url: str, doc: HTMLParser, patterns: List[dict] +) -> Optional[Tuple[str, str]]: """ Tries to quickly extract fulltext URLs using a set of patterns. This function is intendend to be generic across various extraction techniques. @@ -656,36 +665,36 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, """ self_doc_url: Optional[Tuple[str, str]] = None for pattern in patterns: - if 'selector' not in pattern: + if "selector" not in pattern: continue - if 'in_doc_url' in pattern: - if pattern['in_doc_url'] not in doc_url: + if "in_doc_url" in pattern: + if pattern["in_doc_url"] not in doc_url: continue - elem = doc.css_first(pattern['selector']) + elem = doc.css_first(pattern["selector"]) if not elem: continue val = None - if 'attr' in pattern: - val = elem.attrs.get(pattern['attr']) - elif pattern.get('use_body'): + if "attr" in pattern: + val = elem.attrs.get(pattern["attr"]) + elif pattern.get("use_body"): val = elem.text() - if '://' not in val: + if "://" not in val: continue if not val: continue val = urllib.parse.urljoin(doc_url, val) assert val - if 'in_fulltext_url' in pattern: - if pattern['in_fulltext_url'] not in val: + if "in_fulltext_url" in pattern: + if pattern["in_fulltext_url"] not in val: continue for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: if skip_pattern in val.lower(): continue if url_fuzzy_equal(doc_url, val): # don't link to self, unless no other options - self_doc_url = (val, pattern.get('technique', 'unknown')) + self_doc_url = (val, pattern.get("technique", "unknown")) continue - return (val, pattern.get('technique', 'unknown')) + return (val, pattern.get("technique", "unknown")) if self_doc_url: print(" WARN: returning fulltext URL pointing to self", file=sys.stderr) return self_doc_url @@ -703,9 +712,9 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat for field, patterns in HEAD_META_PATTERNS.items(): for pattern in patterns: val = head.css_first(pattern) - #print((field, pattern, val)) - if val and 'content' in val.attrs and val.attrs['content']: - meta[field] = 
val.attrs['content'] + # print((field, pattern, val)) + if val and "content" in val.attrs and val.attrs["content"]: + meta[field] = val.attrs["content"] break for field, patterns in HEAD_META_LIST_PATTERNS.items(): @@ -713,53 +722,53 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat val_list = head.css(pattern) if val_list: for val in val_list: - if 'content' in val.attrs and val.attrs['content']: + if "content" in val.attrs and val.attrs["content"]: if field not in meta: meta[field] = [] - meta[field].append(val.attrs['content']) + meta[field].append(val.attrs["content"]) break # (some) fulltext extractions pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS) if pdf_fulltext_url: - meta['pdf_fulltext_url'] = pdf_fulltext_url[0] + meta["pdf_fulltext_url"] = pdf_fulltext_url[0] xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS) if xml_fulltext_url: - meta['xml_fulltext_url'] = xml_fulltext_url[0] + meta["xml_fulltext_url"] = xml_fulltext_url[0] html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS) if html_fulltext_url: - meta['html_fulltext_url'] = html_fulltext_url[0] + meta["html_fulltext_url"] = html_fulltext_url[0] component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS) if component_url: - meta['component_url'] = component_url[0] + meta["component_url"] = component_url[0] # TODO: replace with clean_doi() et al - if meta.get('doi') and meta.get('doi').startswith('doi:'): - meta['doi'] = meta['doi'][4:] + if meta.get("doi") and meta.get("doi").startswith("doi:"): + meta["doi"] = meta["doi"][4:] - raw_identifiers = meta.pop('raw_identifiers', []) + raw_identifiers = meta.pop("raw_identifiers", []) for ident in raw_identifiers: - if ident.startswith('doi:10.'): - if 'doi' not in meta: - meta['doi'] = ident.replace('doi:', '') - elif ident.startswith('10.') and '/' in ident: - if 'doi' not in meta: - meta['doi'] = ident - elif ident.startswith('isbn:'): - if 'isbn' not in meta: - meta['isbn'] = ident.replace('isbn:', '') - - raw_date = meta.pop('raw_date', None) + if ident.startswith("doi:10."): + if "doi" not in meta: + meta["doi"] = ident.replace("doi:", "") + elif ident.startswith("10.") and "/" in ident: + if "doi" not in meta: + meta["doi"] = ident + elif ident.startswith("isbn:"): + if "isbn" not in meta: + meta["isbn"] = ident.replace("isbn:", "") + + raw_date = meta.pop("raw_date", None) if raw_date: parsed = dateparser.parse(raw_date) if parsed: - meta['release_date'] = parsed.date() + meta["release_date"] = parsed.date() - raw_release_type = meta.pop('raw_release_type', None) + raw_release_type = meta.pop("raw_release_type", None) if raw_release_type: release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip()) if release_type: - meta['release_type'] = release_type + meta["release_type"] = release_type return BiblioMetadata(**meta) @@ -786,29 +795,26 @@ def load_adblock_rules() -> braveblock.Adblocker: "||pbs.twimg.com^", "||badge.dimensions.ai^", "||recaptcha.net^", - # not sure about these CC badges (usually via a redirect) - #"||licensebuttons.net^", - #"||i.creativecommons.org^", - + # "||licensebuttons.net^", + # "||i.creativecommons.org^", # Should we skip jquery, or other generic javascript CDNs? 
- #"||code.jquery.com^", - #"||ajax.googleapis.com^", - #"||cdnjs.cloudflare.com^", - + # "||code.jquery.com^", + # "||ajax.googleapis.com^", + # "||cdnjs.cloudflare.com^", # badges, "share" buttons, tracking, etc "apis.google.com/js/plusone", "www.google.com/recaptcha/", "js/_getUACode.js" - # PLOS images "/resource/img/icon.*.16.png^", ], ) -def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], - type_name: str) -> List[Dict[str, str]]: +def _extract_generic( + doc: HTMLParser, selector: str, attrs: List[str], type_name: str +) -> List[Dict[str, str]]: resources = [] for node in doc.css(selector): @@ -818,21 +824,22 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], url = node.attrs.get(attr) # special-case a couple meta URI prefixes which don't match with adblock rules skip = False - for prefix in ['about:', 'data:', 'magnet:', 'urn:', 'mailto:']: + for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:"]: if url and url.startswith(prefix): skip = True break if skip: continue if url: - #print(url, file=sys.stderr) + # print(url, file=sys.stderr) resources.append(dict(url=url.strip(), type=type_name)) return resources -def html_extract_resources(doc_url: str, doc: HTMLParser, - adblock: braveblock.Adblocker) -> List[Dict[str, str]]: +def html_extract_resources( + doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker +) -> List[Dict[str, str]]: """ This function tries to find all the important resources in a page. The presumption is that the HTML document is article fulltext, and we want the @@ -860,12 +867,14 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, # ensure URLs are absolute for r in resources: - r['url'] = urllib.parse.urljoin(doc_url, r['url']) + r["url"] = urllib.parse.urljoin(doc_url, r["url"]) # filter using adblocker resources = [ - r for r in resources if adblock.check_network_urls( - r['url'], source_url=doc_url, request_type=r['type']) is False + r + for r in resources + if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"]) + is False ] # remove duplicates diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 8f28d42..99a7f36 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -34,50 +34,63 @@ class SandcrawlerBackoffError(Exception): be passed up through any timeout/retry code and become an actual long pause or crash. 
""" + pass -ResourceResult = namedtuple("ResourceResult", [ - "start_url", - "hit", - "status", - "terminal_url", - "terminal_dt", - "terminal_status_code", - "body", - "cdx", - "revisit_cdx", -]) - -WarcResource = namedtuple("WarcResource", [ - "status_code", - "location", - "body", - "revisit_cdx", -]) - -CdxRow = namedtuple('CdxRow', [ - 'surt', - 'datetime', - 'url', - 'mimetype', - 'status_code', - 'sha1b32', - 'sha1hex', - 'warc_csize', - 'warc_offset', - 'warc_path', -]) - -CdxPartial = namedtuple('CdxPartial', [ - 'surt', - 'datetime', - 'url', - 'mimetype', - 'status_code', - 'sha1b32', - 'sha1hex', -]) +ResourceResult = namedtuple( + "ResourceResult", + [ + "start_url", + "hit", + "status", + "terminal_url", + "terminal_dt", + "terminal_status_code", + "body", + "cdx", + "revisit_cdx", + ], +) + +WarcResource = namedtuple( + "WarcResource", + [ + "status_code", + "location", + "body", + "revisit_cdx", + ], +) + +CdxRow = namedtuple( + "CdxRow", + [ + "surt", + "datetime", + "url", + "mimetype", + "status_code", + "sha1b32", + "sha1hex", + "warc_csize", + "warc_offset", + "warc_path", + ], +) + +CdxPartial = namedtuple( + "CdxPartial", + [ + "surt", + "datetime", + "url", + "mimetype", + "status_code", + "sha1b32", + "sha1hex", + ], +) def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial: @@ -102,10 +115,10 @@ def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]: "sha1b32": cdx.sha1b32, "sha1hex": cdx.sha1hex, } - if type(cdx) == CdxRow and '/' in cdx.warc_path: - d['warc_csize'] = cdx.warc_csize - d['warc_offset'] = cdx.warc_offset - d['warc_path'] = cdx.warc_path + if type(cdx) == CdxRow and "/" in cdx.warc_path: + d["warc_csize"] = cdx.warc_csize + d["warc_offset"] = cdx.warc_offset + d["warc_path"] = cdx.warc_path return d @@ -116,9 +129,9 @@ def fuzzy_match_url(left: str, right: str) -> bool: """ if left == right: return True - if '://' in left and '://' in right: - left = '://'.join(left.split('://')[1:]) - right = '://'.join(right.split('://')[1:]) + if "://" in left and "://" in right: + left = "://".join(left.split("://")[1:]) + right = "://".join(right.split("://")[1:]) if left == right: return True if left == right + "/" or right == left + "/": @@ -149,14 +162,17 @@ class CdxApiClient: def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) - cdx_auth_token = kwargs.get('cdx_auth_token', os.environ.get('CDX_AUTH_TOKEN')) + cdx_auth_token = kwargs.get("cdx_auth_token", os.environ.get("CDX_AUTH_TOKEN")) if not cdx_auth_token: raise Exception( - "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)") - self.http_session.headers.update({ - 'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient', - 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token), - }) + "CDX auth token required (as parameter or environment variable CDX_AUTH_TOKEN)" + ) + self.http_session.headers.update( + { + "User-Agent": "Mozilla/5.0 sandcrawler.CdxApiClient", + "Cookie": "cdx_auth_token={}".format(cdx_auth_token), + } + ) def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]: """ @@ -165,7 +181,7 @@ class CdxApiClient: resp = self.http_session.get(self.host_url, params=params) if resp.status_code != 200: raise CdxApiError(resp.text) - #print(resp.url, file=sys.stderr) + # print(resp.url, file=sys.stderr) if not resp.text: return None rj = resp.json() @@ -187,7 +203,7 @@ class CdxApiClient: status_code = 
int(raw[4]) # CDX rows with no WARC records? - if raw[8] == '-' or raw[9] == '-' or raw[10] == '-': + if raw[8] == "-" or raw[9] == "-" or raw[10] == "-": continue row = CdxRow( @@ -206,28 +222,31 @@ class CdxApiClient: rows.append(row) return rows - def fetch(self, - url: str, - datetime: str, - filter_status_code: Optional[int] = None, - retry_sleep: Optional[int] = None) -> CdxRow: + def fetch( + self, + url: str, + datetime: str, + filter_status_code: Optional[int] = None, + retry_sleep: Optional[int] = None, + ) -> CdxRow: """ Fetches a single CDX row by url/datetime. Raises a KeyError if not found, because we expect to be looking up a specific full record. """ if len(datetime) != 14: raise ValueError( - "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) + "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime) + ) params: Dict[str, str] = { - 'url': url, - 'from': datetime, - 'to': datetime, - 'matchType': 'exact', - 'limit': "1", - 'output': 'json', + "url": url, + "from": datetime, + "to": datetime, + "matchType": "exact", + "limit": "1", + "output": "json", } if filter_status_code: - params['filter'] = "statuscode:{}".format(filter_status_code) + params["filter"] = "statuscode:{}".format(filter_status_code) resp = self._query_api(params) if not resp: if retry_sleep and retry_sleep > 0: @@ -235,37 +254,43 @@ class CdxApiClient: if retry_sleep > 3: next_sleep = retry_sleep - 3 retry_sleep = 3 - print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), - file=sys.stderr) + print( + " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), + file=sys.stderr, + ) time.sleep(retry_sleep) - return self.fetch(url, - datetime, - filter_status_code=filter_status_code, - retry_sleep=next_sleep) + return self.fetch( + url, datetime, filter_status_code=filter_status_code, retry_sleep=next_sleep + ) raise KeyError("CDX url/datetime not found: {} {}".format(url, datetime)) row = resp[0] # allow fuzzy http/https match if not (fuzzy_match_url(row.url, url) and row.datetime == datetime): if retry_sleep and retry_sleep > 0: - print(" CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), - file=sys.stderr) + print( + " CDX fetch failed; will sleep {}sec and try again".format(retry_sleep), + file=sys.stderr, + ) time.sleep(retry_sleep) - return self.fetch(url, - datetime, - filter_status_code=filter_status_code, - retry_sleep=None) + return self.fetch( + url, datetime, filter_status_code=filter_status_code, retry_sleep=None + ) raise KeyError( "Didn't get exact CDX url/datetime match. url:{} dt:{} got:{}".format( - url, datetime, row)) + url, datetime, row + ) + ) if filter_status_code: assert row.status_code == filter_status_code return row - def lookup_best(self, - url: str, - max_age_days: Optional[int] = None, - best_mimetype: Optional[str] = None, - closest: Union[datetime.datetime, str, None] = None) -> Optional[CdxRow]: + def lookup_best( + self, + url: str, + max_age_days: Optional[int] = None, + best_mimetype: Optional[str] = None, + closest: Union[datetime.datetime, str, None] = None, + ) -> Optional[CdxRow]: """ Fetches multiple CDX rows for the given URL, tries to find the most recent. 
@@ -289,27 +314,26 @@ class CdxApiClient: """ params: Dict[str, str] = { - 'url': url, - 'matchType': 'exact', - 'limit': "-25", - 'output': 'json', + "url": url, + "matchType": "exact", + "limit": "-25", + "output": "json", # Collapsing seems efficient, but is complex; would need to include # other filters and status code in filter #'collapse': 'timestamp:6', - # Revisits now allowed and resolved! #'filter': '!mimetype:warc/revisit', } if max_age_days: since = datetime.date.today() - datetime.timedelta(days=max_age_days) - params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day) + params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day) if closest: if isinstance(closest, datetime.datetime): - params['closest'] = '%04d%02d%02d' % (closest.year, closest.month, closest.day) + params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day) else: - params['closest'] = closest - params['sort'] = "closest" - #print(params, file=sys.stderr) + params["closest"] = closest + params["sort"] = "closest" + # print(params, file=sys.stderr) rows = self._query_api(params) if not rows: return None @@ -326,7 +350,7 @@ class CdxApiClient: int(r.mimetype == best_mimetype), int(r.mimetype != "warc/revisit"), int(r.datetime[:6]), - int('/' in r.warc_path), + int("/" in r.warc_path), int(r.datetime), ) @@ -358,25 +382,23 @@ class WaybackClient: self.cdx_client = CdxApiClient() # /serve/ instead of /download/ doesn't record view count # this *does* want to be http://, not https:// - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/") # gwb library will fall back to reading from /opt/.petabox/webdata.secret self.petabox_webdata_secret = kwargs.get( - 'petabox_webdata_secret', - os.environ.get('PETABOX_WEBDATA_SECRET'), + "petabox_webdata_secret", + os.environ.get("PETABOX_WEBDATA_SECRET"), ) - self.warc_uri_prefix = kwargs.get('warc_uri_prefix', 'https://archive.org/serve/') + self.warc_uri_prefix = kwargs.get("warc_uri_prefix", "https://archive.org/serve/") self.rstore = None self.max_redirects = 25 self.wayback_endpoint = "https://web.archive.org/web/" self.replay_headers = { - 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient', + "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient", } - def fetch_petabox(self, - csize: int, - offset: int, - warc_path: str, - resolve_revisit: bool = True) -> WarcResource: + def fetch_petabox( + self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True + ) -> WarcResource: """ Fetches wayback resource directly from petabox using WARC path/offset/csize. 
@@ -401,37 +423,49 @@ class WaybackClient: raise Exception("WaybackClient needs petabox secret to do direct WARC fetches") if "/" not in warc_path: raise ValueError( - "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path)) + "what looks like a liveweb/SPN temporary warc path: {}".format(warc_path) + ) warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: self.rstore = ResourceStore( - loaderfactory=CDXLoaderFactory3(webdata_secret=self.petabox_webdata_secret, )) + loaderfactory=CDXLoaderFactory3( + webdata_secret=self.petabox_webdata_secret, + ) + ) assert self.rstore try: - #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) + # print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) gwb_record = self.rstore.load_resource(warc_uri, offset, csize) except wayback.exception.ResourceUnavailable: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) raise PetaboxError( - "failed to load file contents from wayback/petabox (ResourceUnavailable)") + "failed to load file contents from wayback/petabox (ResourceUnavailable)" + ) except wayback.exception.InvalidResource: print(" Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr) raise WaybackContentError( - "failed to load file contents from wayback/petabox (InvalidResource)") + "failed to load file contents from wayback/petabox (InvalidResource)" + ) except urllib3.exceptions.ReadTimeoutError as rte: raise PetaboxError( - "failed to load file contents from wayback/petabox (ReadTimeoutError: {})". - format(rte)) + "failed to load file contents from wayback/petabox (ReadTimeoutError: {})".format( + rte + ) + ) except ValueError as ve: raise PetaboxError( - "failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + "failed to load file contents from wayback/petabox (ValueError: {})".format(ve) + ) except EOFError as eofe: raise PetaboxError( - "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + "failed to load file contents from wayback/petabox (EOFError: {})".format(eofe) + ) except TypeError as te: raise PetaboxError( - "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" - .format(te)) + "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( + te + ) + ) except Exception as e: if "while decompressing data: invalid block type" in str(e): raise PetaboxError( @@ -449,8 +483,11 @@ class WaybackClient: raise WaybackContentError("too many HTTP headers (in wayback fetch)") location = gwb_record.get_location() or None - if status_code is None and gwb_record.target_uri.startswith( - b"ftp://") and not gwb_record.is_revisit(): + if ( + status_code is None + and gwb_record.target_uri.startswith(b"ftp://") + and not gwb_record.is_revisit() + ): # TODO: some additional verification here? 
status_code = 226 @@ -463,17 +500,19 @@ class WaybackClient: if not (revisit_uri and revisit_dt): raise WaybackContentError( "revisit record missing URI and/or DT: warc:{} offset:{}".format( - warc_path, offset)) + warc_path, offset + ) + ) # convert revisit_dt # len("2018-07-24T11:56:49"), or with "Z" assert len(revisit_dt) in (19, 20) if type(revisit_uri) is bytes: - revisit_uri = revisit_uri.decode('utf-8') + revisit_uri = revisit_uri.decode("utf-8") if type(revisit_dt) is bytes: - revisit_dt = revisit_dt.decode('utf-8') - revisit_dt = revisit_dt.replace('-', '').replace(':', - '').replace('T', - '').replace('Z', '') + revisit_dt = revisit_dt.decode("utf-8") + revisit_dt = ( + revisit_dt.replace("-", "").replace(":", "").replace("T", "").replace("Z", "") + ) assert len(revisit_dt) == 14 try: revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt) @@ -491,8 +530,10 @@ class WaybackClient: body = gwb_record.open_raw_content().read() except IncompleteRead as ire: raise WaybackError( - "failed to read actual file contents from wayback/petabox (IncompleteRead: {})" - .format(ire)) + "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( + ire + ) + ) elif status_code is None: raise WaybackContentError("got a None status_code in (W)ARC record") return WarcResource( @@ -502,12 +543,14 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) - def fetch_petabox_body(self, - csize: int, - offset: int, - warc_path: str, - resolve_revisit: bool = True, - expected_status_code: Optional[int] = None) -> bytes: + def fetch_petabox_body( + self, + csize: int, + offset: int, + warc_path: str, + resolve_revisit: bool = True, + expected_status_code: Optional[int] = None, + ) -> bytes: """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -524,20 +567,22 @@ class WaybackClient: if expected_status_code: if expected_status_code != resource.status_code: - raise KeyError("archived HTTP response (WARC) was not {}: {}".format( - expected_status_code, - resource.status_code, - )) + raise KeyError( + "archived HTTP response (WARC) was not {}: {}".format( + expected_status_code, + resource.status_code, + ) + ) elif resource.status_code not in (200, 226): - raise KeyError("archived HTTP response (WARC) was not 200: {}".format( - resource.status_code)) + raise KeyError( + "archived HTTP response (WARC) was not 200: {}".format(resource.status_code) + ) return resource.body - def fetch_replay_body(self, - url: str, - datetime: str, - cdx_sha1hex: Optional[str] = None) -> bytes: + def fetch_replay_body( + self, url: str, datetime: str, cdx_sha1hex: Optional[str] = None + ) -> bytes: """ Fetches an HTTP 200 record from wayback via the replay interface (web.archive.org) instead of petabox. @@ -570,32 +615,42 @@ class WaybackClient: except UnicodeDecodeError: raise WaybackContentError( "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format( - url)) + url + ) + ) try: resp.raise_for_status() except Exception as e: raise WaybackError(str(e)) - #print(resp.url, file=sys.stderr) + # print(resp.url, file=sys.stderr) # defensively check that this is actually correct replay based on headers if "X-Archive-Src" not in resp.headers: raise WaybackError("replay fetch didn't return X-Archive-Src in headers") if datetime not in resp.url: - raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format( - datetime, resp.url)) + raise WaybackError( + "didn't get exact reply (redirect?) 
datetime:{} got:{}".format( + datetime, resp.url + ) + ) if cdx_sha1hex: # verify that body matches CDX hash # TODO: don't need *all* these hashes, just sha1 file_meta = gen_file_metadata(resp.content) - if cdx_sha1hex != file_meta['sha1hex']: - print(" REPLAY MISMATCH: cdx:{} replay:{}".format(cdx_sha1hex, - file_meta['sha1hex']), - file=sys.stderr) + if cdx_sha1hex != file_meta["sha1hex"]: + print( + " REPLAY MISMATCH: cdx:{} replay:{}".format( + cdx_sha1hex, file_meta["sha1hex"] + ), + file=sys.stderr, + ) raise WaybackContentError( "replay fetch body didn't match CDX hash cdx:{} body:{}".format( - cdx_sha1hex, file_meta['sha1hex']), ) + cdx_sha1hex, file_meta["sha1hex"] + ), + ) return resp.content def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]: @@ -625,37 +680,44 @@ class WaybackClient: except UnicodeDecodeError: raise WaybackContentError( "UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format( - url)) + url + ) + ) try: resp.raise_for_status() except Exception as e: raise WaybackError(str(e)) - #print(resp.url, file=sys.stderr) + # print(resp.url, file=sys.stderr) # defensively check that this is actually correct replay based on headers # previously check for "X-Archive-Redirect-Reason" here if "X-Archive-Src" not in resp.headers: raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers") if datetime not in resp.url: - raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format( - datetime, resp.url)) + raise WaybackError( + "didn't get exact reply (redirect?) datetime:{} got:{}".format( + datetime, resp.url + ) + ) redirect_url = resp.headers.get("Location") # eg, https://web.archive.org/web/20200111003923id_/https://dx.doi.org/10.17504/protocols.io.y2gfybw - #print(redirect_url, file=sys.stderr) + # print(redirect_url, file=sys.stderr) if redirect_url and redirect_url.startswith("https://web.archive.org/web/"): redirect_url = "/".join(redirect_url.split("/")[5:]) - #print(redirect_url, file=sys.stderr) + # print(redirect_url, file=sys.stderr) if redirect_url and redirect_url.startswith("http"): redirect_url = clean_url(redirect_url) return redirect_url else: return None - def lookup_resource(self, - start_url: str, - best_mimetype: Optional[str] = None, - closest: Union[str, datetime.datetime, None] = None) -> ResourceResult: + def lookup_resource( + self, + start_url: str, + best_mimetype: Optional[str] = None, + closest: Union[str, datetime.datetime, None] = None, + ) -> ResourceResult: """ Looks in wayback for a resource starting at the URL, following any redirects. 
Returns a ResourceResult object, which may indicate a @@ -684,8 +746,9 @@ class WaybackClient: for i in range(self.max_redirects + 1): print(" URL: {}".format(next_url), file=sys.stderr) next_row: Optional[CdxRow] = self.cdx_client.lookup_best( - next_url, best_mimetype=best_mimetype, closest=closest) - #print(next_row, file=sys.stderr) + next_url, best_mimetype=best_mimetype, closest=closest + ) + # print(next_row, file=sys.stderr) if not next_row: return ResourceResult( start_url=start_url, @@ -702,7 +765,7 @@ class WaybackClient: cdx_row: CdxRow = next_row # first try straight-forward redirect situation - if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path: + if cdx_row.mimetype == "warc/revisit" and "/" in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, offset=cdx_row.warc_offset, @@ -725,7 +788,7 @@ class WaybackClient: if cdx_row.status_code in (200, 226): revisit_cdx = None final_cdx: Union[CdxRow, CdxPartial] = cdx_row - if '/' in cdx_row.warc_path: + if "/" in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, offset=cdx_row.warc_offset, @@ -751,7 +814,7 @@ class WaybackClient: revisit_cdx=revisit_cdx, ) elif 300 <= (cdx_row.status_code or 0) < 400: - if '/' in cdx_row.warc_path: + if "/" in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, offset=cdx_row.warc_offset, @@ -848,34 +911,39 @@ class SavePageNowBackoffError(SandcrawlerBackoffError): pass -SavePageNowResult = namedtuple('SavePageNowResult', [ - 'success', - 'status', - 'job_id', - 'request_url', - 'terminal_url', - 'terminal_dt', - 'resources', -]) +SavePageNowResult = namedtuple( + "SavePageNowResult", + [ + "success", + "status", + "job_id", + "request_url", + "terminal_url", + "terminal_dt", + "resources", + ], +) class SavePageNowClient: def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs): - self.ia_access_key = kwargs.get('ia_access_key', os.environ.get('IA_ACCESS_KEY')) - self.ia_secret_key = kwargs.get('ia_secret_key', os.environ.get('IA_SECRET_KEY')) + self.ia_access_key = kwargs.get("ia_access_key", os.environ.get("IA_ACCESS_KEY")) + self.ia_secret_key = kwargs.get("ia_secret_key", os.environ.get("IA_SECRET_KEY")) self.v2endpoint = v2endpoint self.v2_session = requests_retry_session(retries=5, backoff_factor=3) - self.v2_session.headers.update({ - 'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient', - 'Accept': 'application/json', - 'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key), - }) + self.v2_session.headers.update( + { + "User-Agent": "Mozilla/5.0 sandcrawler.SavePageNowClient", + "Accept": "application/json", + "Authorization": "LOW {}:{}".format(self.ia_access_key, self.ia_secret_key), + } + ) # 3 minutes total self.poll_count = 60 self.poll_seconds = 3.0 - self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0) + self.spn_cdx_retry_sec = kwargs.get("spn_cdx_retry_sec", 9.0) # these are special-case web domains for which we want SPN2 to not run # a headless browser (brozzler), but instead simply run wget. 
@@ -888,20 +956,20 @@ class SavePageNowClient: "://europepmc.org/backend/ptpmcrender.fcgi", "://pdfs.semanticscholar.org/", "://res.mdpi.com/", - # platform sites "://zenodo.org/", "://figshare.org/", "://springernature.figshare.com/", - # popular simple cloud storage or direct links "://s3-eu-west-1.amazonaws.com/", ] - def save_url_now_v2(self, - request_url: str, - force_simple_get: Optional[int] = None, - capture_outlinks: int = 0) -> SavePageNowResult: + def save_url_now_v2( + self, + request_url: str, + force_simple_get: Optional[int] = None, + capture_outlinks: int = 0, + ) -> SavePageNowResult: """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed at all, or raises an exception if there was an error with SPN itself. @@ -944,33 +1012,39 @@ class SavePageNowClient: resp = self.v2_session.post( self.v2endpoint, data={ - 'url': request_url, - 'capture_all': 1, - 'capture_outlinks': capture_outlinks, - 'capture_screenshot': 0, - 'if_not_archived_within': '1d', - 'force_get': force_simple_get, - 'skip_first_archive': 1, - 'outlinks_availability': 0, - 'js_behavior_timeout': 0, + "url": request_url, + "capture_all": 1, + "capture_outlinks": capture_outlinks, + "capture_screenshot": 0, + "if_not_archived_within": "1d", + "force_get": force_simple_get, + "skip_first_archive": 1, + "outlinks_availability": 0, + "js_behavior_timeout": 0, }, ) if resp.status_code == 429: - raise SavePageNowBackoffError("status_code: {}, url: {}".format( - resp.status_code, request_url)) + raise SavePageNowBackoffError( + "status_code: {}, url: {}".format(resp.status_code, request_url) + ) elif resp.status_code != 200: - raise SavePageNowError("SPN2 status_code: {}, url: {}".format( - resp.status_code, request_url)) + raise SavePageNowError( + "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url) + ) resp_json = resp.json() - if resp_json and 'message' in resp_json and 'You have already reached the limit of active sessions' in resp_json[ - 'message']: - raise SavePageNowBackoffError(resp_json['message']) - elif not resp_json or 'job_id' not in resp_json or not resp_json['job_id']: + if ( + resp_json + and "message" in resp_json + and "You have already reached the limit of active sessions" in resp_json["message"] + ): + raise SavePageNowBackoffError(resp_json["message"]) + elif not resp_json or "job_id" not in resp_json or not resp_json["job_id"]: raise SavePageNowError( - "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json)) + "Didn't get expected 'job_id' field in SPN2 response: {}".format(resp_json) + ) - job_id = resp_json['job_id'] + job_id = resp_json["job_id"] print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr) # poll until complete @@ -981,53 +1055,59 @@ class SavePageNowClient: resp.raise_for_status() except Exception: raise SavePageNowError(resp.content) - status = resp.json()['status'] - if status == 'pending': + status = resp.json()["status"] + if status == "pending": time.sleep(self.poll_seconds) - elif status in ('success', 'error'): + elif status in ("success", "error"): final_json = resp.json() break else: - raise SavePageNowError("Unknown SPN2 status:{} url:{}".format( - status, request_url)) + raise SavePageNowError( + "Unknown SPN2 status:{} url:{}".format(status, request_url) + ) if not final_json: raise SavePageNowError("SPN2 timed out (polling count exceeded)") # if there was a recent crawl of same URL, fetch the status of that # crawl to get correct datetime - if final_json.get('original_job_id'): - 
print(f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", - file=sys.stderr) - resp = self.v2_session.get("{}/status/{}".format(self.v2endpoint, - final_json['original_job_id'])) + if final_json.get("original_job_id"): + print( + f" SPN recent capture: {job_id} -> {final_json['original_job_id']}", + file=sys.stderr, + ) + resp = self.v2_session.get( + "{}/status/{}".format(self.v2endpoint, final_json["original_job_id"]) + ) try: resp.raise_for_status() except Exception: raise SavePageNowError(resp.content) final_json = resp.json() - #print(final_json, file=sys.stderr) + # print(final_json, file=sys.stderr) - if final_json['status'] == "success": - if final_json.get('original_url').startswith('/'): - print(f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", - file=sys.stderr) + if final_json["status"] == "success": + if final_json.get("original_url").startswith("/"): + print( + f" truncateded URL in JSON: {request_url} {json.dumps(final_json)}", + file=sys.stderr, + ) return SavePageNowResult( True, "success", job_id, request_url, - final_json['original_url'], - final_json['timestamp'], - final_json['resources'], + final_json["original_url"], + final_json["timestamp"], + final_json["resources"], ) else: - if final_json['status'] == 'pending': - final_json['status'] = 'error:pending' + if final_json["status"] == "pending": + final_json["status"] = "error:pending" return SavePageNowResult( False, - final_json.get('status_ext') or final_json['status'], + final_json.get("status_ext") or final_json["status"], job_id, request_url, None, @@ -1035,10 +1115,12 @@ class SavePageNowClient: None, ) - def crawl_resource(self, - start_url: str, - wayback_client: WaybackClient, - force_simple_get: Optional[int] = None) -> ResourceResult: + def crawl_resource( + self, + start_url: str, + wayback_client: WaybackClient, + force_simple_get: Optional[int] = None, + ) -> ResourceResult: """ Runs a SPN2 crawl, then fetches body. 
@@ -1048,18 +1130,23 @@ class SavePageNowClient: """ # HACK: capture CNKI domains with outlinks (for COVID-19 crawling) - if 'gzbd.cnki.net/' in start_url: - spn_result = self.save_url_now_v2(start_url, - force_simple_get=force_simple_get, - capture_outlinks=1) + if "gzbd.cnki.net/" in start_url: + spn_result = self.save_url_now_v2( + start_url, force_simple_get=force_simple_get, capture_outlinks=1 + ) else: spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get) if not spn_result.success: status = spn_result.status - if status in ("error:invalid-url", "error:not-found", - "error:invalid-host-resolution", "error:gateway-timeout", - "error:too-many-redirects", "error:read-timeout"): + if status in ( + "error:invalid-url", + "error:not-found", + "error:invalid-host-resolution", + "error:gateway-timeout", + "error:too-many-redirects", + "error:read-timeout", + ): status = status.replace("error:", "") elif status in ("error:no-access", "error:forbidden"): status = "forbidden" @@ -1070,8 +1157,10 @@ class SavePageNowClient: elif status.startswith("error:"): status = "spn2-" + status # despite other errors, call these a failure (so we don't retry) - if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') - or spn_result.terminal_url.endswith("cookieSet=1")): + if spn_result.terminal_url and ( + spn_result.terminal_url.endswith("/cookieAbsent") + or spn_result.terminal_url.endswith("cookieSet=1") + ): status = "blocked-cookie" return ResourceResult( start_url=start_url, @@ -1084,10 +1173,10 @@ class SavePageNowClient: cdx=None, revisit_cdx=None, ) - #print(spn_result, file=sys.stderr) + # print(spn_result, file=sys.stderr) # detect partial URL response (aka, success, but missing full URL) - if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith('/'): + if "://" not in spn_result.terminal_url or spn_result.terminal_url.startswith("/"): return ResourceResult( start_url=start_url, hit=False, @@ -1102,7 +1191,8 @@ class SavePageNowClient: # don't try to CDX fetch for this common cookie block terminal if spn_result.terminal_url.endswith( - '/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): + "/cookieAbsent" + ) or spn_result.terminal_url.endswith("cookieSet=1"): return ResourceResult( start_url=start_url, hit=False, @@ -1127,7 +1217,7 @@ class SavePageNowClient: cdx_row = elsevier_pdf_cdx else: print(" Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) - #print(elsevier_pdf_cdx, file=sys.stderr) + # print(elsevier_pdf_cdx, file=sys.stderr) if not cdx_row: # lookup exact @@ -1164,11 +1254,11 @@ class SavePageNowClient: revisit_cdx=None, ) - #print(cdx_row, file=sys.stderr) + # print(cdx_row, file=sys.stderr) revisit_cdx = None final_cdx: Union[CdxRow, CdxPartial] = cdx_row - if '/' in cdx_row.warc_path: + if "/" in cdx_row.warc_path: # Usually can't do this kind of direct fetch because CDX result is recent/live resource = wayback_client.fetch_petabox( csize=cdx_row.warc_csize, @@ -1228,12 +1318,19 @@ class SavePageNowClient: ) -def fix_transfer_encoding(file_meta: dict, - resource: ResourceResult) -> Tuple[dict, ResourceResult]: - if resource.body and file_meta[ - 'mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip': - print(" transfer encoding not stripped: {}".format(resource.cdx.mimetype), - file=sys.stderr) +def fix_transfer_encoding( + file_meta: dict, resource: ResourceResult +) -> Tuple[dict, ResourceResult]: + if ( + resource.body + and 
file_meta["mimetype"] == "application/gzip" + and resource.cdx + and resource.cdx.mimetype != "application/gzip" + ): + print( + " transfer encoding not stripped: {}".format(resource.cdx.mimetype), + file=sys.stderr, + ) inner_body = gzip.decompress(resource.body) if not inner_body: raise Exception("null body inside transfer encoding") diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 49c7ddf..4a5abbe 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -10,15 +10,32 @@ from selectolax.parser import HTMLParser from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.grobid import GrobidClient from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_metadata import (html_extract_biblio, html_extract_resources, - load_adblock_rules) -from sandcrawler.ia import (CdxApiError, NoCaptureError, PetaboxError, ResourceResult, - SavePageNowClient, SavePageNowError, WaybackClient, - WaybackContentError, WaybackError, cdx_to_dict, - fix_transfer_encoding) -from sandcrawler.ingest_html import (WebResource, fetch_html_resources, - html_extract_body_teixml, html_guess_platform, - html_guess_scope, quick_fetch_html_resources) +from sandcrawler.html_metadata import ( + html_extract_biblio, + html_extract_resources, + load_adblock_rules, +) +from sandcrawler.ia import ( + CdxApiError, + NoCaptureError, + PetaboxError, + ResourceResult, + SavePageNowClient, + SavePageNowError, + WaybackClient, + WaybackContentError, + WaybackError, + cdx_to_dict, + fix_transfer_encoding, +) +from sandcrawler.ingest_html import ( + WebResource, + fetch_html_resources, + html_extract_body_teixml, + html_guess_platform, + html_guess_scope, + quick_fetch_html_resources, +) from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime from sandcrawler.pdfextract import PdfExtractResult, process_pdf from sandcrawler.workers import SandcrawlerWorker @@ -53,74 +70,71 @@ class IngestFileWorker(SandcrawlerWorker): process_file_hit(ResourceResult) -> response process_grobid(ResourceResult) """ + def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs): super().__init__() self.sink = sink - if kwargs.get('wayback_client'): - self.wayback_client: WaybackClient = kwargs['wayback_client'] + if kwargs.get("wayback_client"): + self.wayback_client: WaybackClient = kwargs["wayback_client"] else: self.wayback_client = WaybackClient() - if kwargs.get('spn_client'): - self.spn_client: SavePageNowClient = kwargs['spn_client'] + if kwargs.get("spn_client"): + self.spn_client: SavePageNowClient = kwargs["spn_client"] else: self.spn_client = SavePageNowClient( - spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) + spn_cdx_retry_sec=kwargs.get("spn_cdx_retry_sec", 9.0) + ) - if kwargs.get('grobid_client'): - self.grobid_client: GrobidClient = kwargs['grobid_client'] + if kwargs.get("grobid_client"): + self.grobid_client: GrobidClient = kwargs["grobid_client"] else: self.grobid_client = GrobidClient() - if kwargs.get('pgrest_client'): - self.pgrest_client: SandcrawlerPostgrestClient = kwargs['pgrest_client'] + if kwargs.get("pgrest_client"): + self.pgrest_client: SandcrawlerPostgrestClient = kwargs["pgrest_client"] else: self.pgrest_client = SandcrawlerPostgrestClient() - self.grobid_sink = kwargs.get('grobid_sink') - self.thumbnail_sink = kwargs.get('thumbnail_sink') - self.pdftext_sink = kwargs.get('pdftext_sink') - self.xmldoc_sink = kwargs.get('xmldoc_sink') - self.htmlteixml_sink = 
kwargs.get('htmlteixml_sink') + self.grobid_sink = kwargs.get("grobid_sink") + self.thumbnail_sink = kwargs.get("thumbnail_sink") + self.pdftext_sink = kwargs.get("pdftext_sink") + self.xmldoc_sink = kwargs.get("xmldoc_sink") + self.htmlteixml_sink = kwargs.get("htmlteixml_sink") self.max_hops = 6 - self.try_existing_ingest = kwargs.get('try_existing_ingest', False) - self.try_existing_grobid = kwargs.get('try_existing_grobid', True) - self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True) - self.try_wayback = kwargs.get('try_wayback', True) - self.try_spn2 = kwargs.get('try_spn2', True) - self.html_quick_mode = kwargs.get('html_quick_mode', False) + self.try_existing_ingest = kwargs.get("try_existing_ingest", False) + self.try_existing_grobid = kwargs.get("try_existing_grobid", True) + self.try_existing_pdfextract = kwargs.get("try_existing_pdfextract", True) + self.try_wayback = kwargs.get("try_wayback", True) + self.try_spn2 = kwargs.get("try_spn2", True) + self.html_quick_mode = kwargs.get("html_quick_mode", False) self.adblock_rules = load_adblock_rules() self.max_html_resources = 200 self.base_url_blocklist = [ # robot blocking "://hkvalidate.perfdrive.com/", - # temporary, until we implement specific fetch and 'petabox' output "://archive.org/", "://www.archive.org/", "://web.archive.org/web/", - # out of scope "://openlibrary.org/", "://www.openlibrary.org/", "://fatcat.wiki/", "://orcid.org/", "://doaj.org/", - # Domain squats "://bartandjones.com", "://ijretm.com", "://ijrcemas.com", "://jist.net.in", "://croisements-revue.org", - # all stubs/previews, not full papers "://page-one.live.cf.public.springer.com", - # large datasets-only (no PDF expected) "plutof.ut.ee/", "www.gbif.org/", @@ -129,16 +143,13 @@ class IngestFileWorker(SandcrawlerWorker): "://doi.org/10.25642/ipk/gbis/", "://apex.ipk-gatersleben.de/", "fao.org/glis/", - # Historical non-paper content: "dhz.uni-passau.de/", # newspapers "digital.ucd.ie/", # ireland national historical - # DOI prefixes "doi.org/10.2307/", # JSTOR; slow and many redirects "doi.org/10.18730/", # fao.org: database entry "doi.org/10.15468/", # gbif.org: database entry - # deprecated domain (doesn't redirect correctly) "://edoc.mpg.de/", ] @@ -216,15 +227,14 @@ class IngestFileWorker(SandcrawlerWorker): return None existing = self.pgrest_client.get_ingest_file_result(ingest_type, base_url) # TODO: filter on more flags? - if existing and existing['hit'] is True: + if existing and existing["hit"] is True: return existing else: return None - def find_resource(self, - url: str, - best_mimetype: Optional[str] = None, - force_recrawl: bool = False) -> Optional[ResourceResult]: + def find_resource( + self, url: str, best_mimetype: Optional[str] = None, force_recrawl: bool = False + ) -> Optional[ResourceResult]: """ Looks in wayback for a resource starting at the URL, following any redirects. If a hit isn't found, try crawling with SPN. @@ -233,7 +243,8 @@ class IngestFileWorker(SandcrawlerWorker): resource = None if url.startswith("http://web.archive.org/web/") or url.startswith( - "https://web.archive.org/web/"): + "https://web.archive.org/web/" + ): raise NotImplementedError("handling direct wayback links not supported yet") if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"): @@ -247,20 +258,32 @@ class IngestFileWorker(SandcrawlerWorker): soft404 = False # NOTE: these are often not working with SPNv2 either, so disabling. 
If # we really want to try again, should do force-recrawl - #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'): + # if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'): # soft404 = True old_failure = False - if resource and not resource.hit and resource.terminal_dt and resource.terminal_dt < '20190000000000': + if ( + resource + and not resource.hit + and resource.terminal_dt + and resource.terminal_dt < "20190000000000" + ): old_failure = True - if self.try_spn2 and (resource is None or (resource and resource.status == 'no-capture') - or soft404 or old_failure): + if self.try_spn2 and ( + resource is None + or (resource and resource.status == "no-capture") + or soft404 + or old_failure + ): via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) - print("[FETCH {:>6}] {} {}".format(via, (resource and resource.status), - (resource and resource.terminal_url) or url), - file=sys.stderr) + print( + "[FETCH {:>6}] {} {}".format( + via, (resource and resource.status), (resource and resource.terminal_url) or url + ), + file=sys.stderr, + ) return resource def process_existing(self, request: dict, result_row: dict) -> dict: @@ -269,51 +292,55 @@ class IngestFileWorker(SandcrawlerWorker): additional processing necessary to return a result. """ raise NotImplementedError("process_existing() not tested or safe yet") - assert result_row['hit'] - existing_file_meta = self.pgrest_client.get_file_meta(result_row['terminal_sha1hex']) - existing_grobid = self.pgrest_client.get_grobid(result_row['terminal_sha1hex']) - existing_cdx = self.pgrest_client.get_cdx(result_row['terminal_url'], - result_row['terminal_dt']) + assert result_row["hit"] + existing_file_meta = self.pgrest_client.get_file_meta(result_row["terminal_sha1hex"]) + existing_grobid = self.pgrest_client.get_grobid(result_row["terminal_sha1hex"]) + existing_cdx = self.pgrest_client.get_cdx( + result_row["terminal_url"], result_row["terminal_dt"] + ) if not (existing_file_meta and existing_grobid and existing_cdx): raise NotImplementedError("partially-existing records not implemented yet") result = { - 'hit': result_row['hit'], - 'status': "existing", - 'request': request, - 'grobid': existing_grobid, - 'file_meta': existing_file_meta, - 'cdx': existing_cdx, - 'terminal': { - 'terminal_url': result_row['terminal_url'], - 'terminal_dt': result_row['terminal_dt'], - 'terminal_status_code': result_row['terminal_status_code'], - 'terminal_sha1hex': result_row['terminal_sha1hex'], + "hit": result_row["hit"], + "status": "existing", + "request": request, + "grobid": existing_grobid, + "file_meta": existing_file_meta, + "cdx": existing_cdx, + "terminal": { + "terminal_url": result_row["terminal_url"], + "terminal_dt": result_row["terminal_dt"], + "terminal_status_code": result_row["terminal_status_code"], + "terminal_sha1hex": result_row["terminal_sha1hex"], }, } return result - def process_file_hit(self, ingest_type: str, resource: ResourceResult, - file_meta: dict) -> dict: + def process_file_hit( + self, ingest_type: str, resource: ResourceResult, file_meta: dict + ) -> dict: """ Run all the necessary processing for a new/fresh ingest hit. 
""" - if ingest_type in ["dataset-file", "component" - ] and file_meta['mimetype'] == "application/pdf": + if ( + ingest_type in ["dataset-file", "component"] + and file_meta["mimetype"] == "application/pdf" + ): ingest_type = "pdf" if ingest_type == "pdf": return { - 'grobid': self.process_grobid(resource, file_meta), - 'pdf_meta': self.process_pdfextract(resource, file_meta), + "grobid": self.process_grobid(resource, file_meta), + "pdf_meta": self.process_pdfextract(resource, file_meta), } elif ingest_type == "xml": return { - 'xml_meta': self.process_xml(resource, file_meta), + "xml_meta": self.process_xml(resource, file_meta), } elif ingest_type == "html": html_info = self.process_html(resource, file_meta) # if there is no html_biblio, don't clobber anything possibly extracted earlier - if 'html_biblio' in html_info and not html_info['html_biblio']: - html_info.pop('html_biblio') + if "html_biblio" in html_info and not html_info["html_biblio"]: + html_info.pop("html_biblio") return html_info elif ingest_type == "src": return {} @@ -332,7 +359,7 @@ class IngestFileWorker(SandcrawlerWorker): decide if we should re-process """ if self.try_existing_grobid: - existing = self.pgrest_client.get_grobid(file_meta['sha1hex']) + existing = self.pgrest_client.get_grobid(file_meta["sha1hex"]) if existing: print("found existing GROBID result", file=sys.stderr) return existing @@ -341,18 +368,18 @@ class IngestFileWorker(SandcrawlerWorker): result = self.grobid_client.process_fulltext(resource.body) if self.grobid_sink: # extra fields for GROBID kafka messages - result['file_meta'] = file_meta - result['key'] = result['file_meta']['sha1hex'] + result["file_meta"] = file_meta + result["key"] = result["file_meta"]["sha1hex"] self.grobid_sink.push_record(result.copy()) - if result['status'] == "success": + if result["status"] == "success": metadata = self.grobid_client.metadata(result) if metadata: - result['metadata'] = self.grobid_client.metadata(result) - result['fatcat_release'] = result['metadata'].pop('fatcat_release', None) - result['grobid_version'] = result['metadata'].pop('grobid_version', None) - result.pop('tei_xml', None) - result.pop('file_meta', None) - result.pop('key', None) + result["metadata"] = self.grobid_client.metadata(result) + result["fatcat_release"] = result["metadata"].pop("fatcat_release", None) + result["grobid_version"] = result["metadata"].pop("grobid_version", None) + result.pop("tei_xml", None) + result.pop("file_meta", None) + result.pop("key", None) return result def process_pdfextract(self, resource: ResourceResult, file_meta: dict) -> dict: @@ -365,7 +392,7 @@ class IngestFileWorker(SandcrawlerWorker): TODO: difference between Kafka schema and SQL/postgrest schema """ if self.try_existing_pdfextract: - existing = self.pgrest_client.get_pdf_meta(file_meta['sha1hex']) + existing = self.pgrest_client.get_pdf_meta(file_meta["sha1hex"]) if existing: print("found existing pdf_meta result", file=sys.stderr) result = PdfExtractResult.from_pdf_meta_dict(existing) @@ -373,9 +400,9 @@ class IngestFileWorker(SandcrawlerWorker): # Need to actually processes result = process_pdf(resource.body) - assert result.sha1hex == file_meta['sha1hex'] + assert result.sha1hex == file_meta["sha1hex"] assert result.file_meta is not None - assert result.file_meta['sha1hex'] == file_meta['sha1hex'] + assert result.file_meta["sha1hex"] == file_meta["sha1hex"] if self.thumbnail_sink and result.page0_thumbnail is not None: self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex) 
if self.pdftext_sink: @@ -392,7 +419,7 @@ class IngestFileWorker(SandcrawlerWorker): In the future, could extract other metadata here (like body word count), or attempting to fetch sub-resources. """ - if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml": + if self.xmldoc_sink and file_meta["mimetype"] == "application/jats+xml": try: jats_xml = xml_reserialize(resource.body) except xml.etree.ElementTree.ParseError: @@ -402,7 +429,7 @@ class IngestFileWorker(SandcrawlerWorker): status="success", jats_xml=jats_xml, ) - self.xmldoc_sink.push_record(msg, key=file_meta['sha1hex']) + self.xmldoc_sink.push_record(msg, key=file_meta["sha1hex"]) return dict(status="success") def process_html(self, resource: ResourceResult, file_meta: dict) -> dict: @@ -416,11 +443,12 @@ class IngestFileWorker(SandcrawlerWorker): assert html_biblio html_body = html_extract_body_teixml(resource.body) html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio) - html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, - html_body.get('word_count')) + html_scope = html_guess_scope( + resource.terminal_url, html_doc, html_biblio, html_body.get("word_count") + ) html_biblio_dict = json.loads(html_biblio.json(exclude_none=True)) - if html_scope in ('blocked-captcha', 'blocked-cookie', 'blocked-forbidden'): + if html_scope in ("blocked-captcha", "blocked-cookie", "blocked-forbidden"): return dict( status=html_scope, html_biblio=html_biblio_dict, @@ -428,8 +456,8 @@ class IngestFileWorker(SandcrawlerWorker): platform=html_platform, ) elif html_scope not in ( - 'article-fulltext', - 'unknown', + "article-fulltext", + "unknown", ): html_body.pop("tei_xml", None) return dict( @@ -440,8 +468,9 @@ class IngestFileWorker(SandcrawlerWorker): html_body=html_body, ) - raw_resources = html_extract_resources(resource.terminal_url, html_doc, - self.adblock_rules) + raw_resources = html_extract_resources( + resource.terminal_url, html_doc, self.adblock_rules + ) if len(raw_resources) > self.max_html_resources: html_body.pop("tei_xml", None) return dict( @@ -452,8 +481,8 @@ class IngestFileWorker(SandcrawlerWorker): html_body=html_body, ) - if self.htmlteixml_sink and html_body['status'] == "success": - self.htmlteixml_sink.push_record(html_body, key=file_meta['sha1hex']) + if self.htmlteixml_sink and html_body["status"] == "success": + self.htmlteixml_sink.push_record(html_body, key=file_meta["sha1hex"]) html_body.pop("tei_xml", None) @@ -470,30 +499,30 @@ class IngestFileWorker(SandcrawlerWorker): try: if self.html_quick_mode: print(" WARN: running quick CDX-only fetches", file=sys.stderr) - full_resources = quick_fetch_html_resources(raw_resources, - self.wayback_client.cdx_client, - when) + full_resources = quick_fetch_html_resources( + raw_resources, self.wayback_client.cdx_client, when + ) else: full_resources = fetch_html_resources(raw_resources, self.wayback_client, when) except PetaboxError as e: - partial_result['status'] = 'petabox-error' - partial_result['error_message'] = str(e)[:1600] + partial_result["status"] = "petabox-error" + partial_result["error_message"] = str(e)[:1600] return partial_result except CdxApiError as e: - partial_result['status'] = 'cdx-error' - partial_result['error_message'] = str(e)[:1600] + partial_result["status"] = "cdx-error" + partial_result["error_message"] = str(e)[:1600] return partial_result except WaybackError as e: - partial_result['status'] = 'wayback-error' - partial_result['error_message'] = str(e)[:1600] + 
partial_result["status"] = "wayback-error" + partial_result["error_message"] = str(e)[:1600] return partial_result except WaybackContentError as e: - partial_result['status'] = 'wayback-content-error' - partial_result['error_message'] = str(e)[:1600] + partial_result["status"] = "wayback-content-error" + partial_result["error_message"] = str(e)[:1600] return partial_result except NoCaptureError as e: - partial_result['status'] = 'html-resource-no-capture' - partial_result['error_message'] = str(e)[:1600] + partial_result["status"] = "html-resource-no-capture" + partial_result["error_message"] = str(e)[:1600] return partial_result info = dict( @@ -503,8 +532,8 @@ class IngestFileWorker(SandcrawlerWorker): platform=html_platform, html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources], ) - if html_scope == 'unknown': - info['status'] = 'unknown-scope' + if html_scope == "unknown": + info["status"] = "unknown-scope" return info def timeout_response(self, task: dict) -> dict: @@ -517,7 +546,7 @@ class IngestFileWorker(SandcrawlerWorker): ) def want(self, request: dict) -> bool: - if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html', 'src', 'component'): + if not request.get("ingest_type") in ("file", "pdf", "xml", "html", "src", "component"): return False return True @@ -527,19 +556,19 @@ class IngestFileWorker(SandcrawlerWorker): def process_file(self, request: dict, key: Any = None) -> dict: # old backwards compatibility - if request.get('ingest_type') == 'file': - request['ingest_type'] = 'pdf' + if request.get("ingest_type") == "file": + request["ingest_type"] = "pdf" - ingest_type = request.get('ingest_type') + ingest_type = request.get("ingest_type") if ingest_type not in ("pdf", "xml", "html", "src", "component"): raise NotImplementedError(f"can't handle ingest_type={ingest_type}") # parse/clean URL # note that we pass through the original/raw URL, and that is what gets # persisted in database table - base_url = clean_url(request['base_url']) + base_url = clean_url(request["base_url"]) - force_recrawl = bool(request.get('force_recrawl', False)) + force_recrawl = bool(request.get("force_recrawl", False)) for block in self.base_url_blocklist: if block in base_url: @@ -569,112 +598,113 @@ class IngestFileWorker(SandcrawlerWorker): while len(hops) <= self.max_hops: - result['hops'] = hops + result["hops"] = hops # check against blocklist again on each hop for block in self.base_url_blocklist: if block in next_url: - result['status'] = "skip-url-blocklist" + result["status"] = "skip-url-blocklist" return result # check against known loginwall URLs for block in self.wall_blocklist: if block in next_url: # TODO: blocked-wall instead of skip-wall - result['status'] = "skip-wall" + result["status"] = "skip-wall" return result # check for popular cookie blocking URL patterns. 
On successful SPN # crawls, shouldn't see these redirect URLs for pattern in self.cookie_blocklist: if pattern in next_url: - result['status'] = 'blocked-cookie' + result["status"] = "blocked-cookie" return result try: - resource = self.find_resource(next_url, - best_mimetype, - force_recrawl=force_recrawl) + resource = self.find_resource( + next_url, best_mimetype, force_recrawl=force_recrawl + ) except SavePageNowError as e: - result['status'] = 'spn2-error' - result['error_message'] = str(e)[:1600] + result["status"] = "spn2-error" + result["error_message"] = str(e)[:1600] return result except PetaboxError as e: - result['status'] = 'petabox-error' - result['error_message'] = str(e)[:1600] + result["status"] = "petabox-error" + result["error_message"] = str(e)[:1600] return result except CdxApiError as e: - result['status'] = 'cdx-error' - result['error_message'] = str(e)[:1600] + result["status"] = "cdx-error" + result["error_message"] = str(e)[:1600] # add a sleep in cdx-error path as a slow-down time.sleep(2.0) return result except WaybackError as e: - result['status'] = 'wayback-error' - result['error_message'] = str(e)[:1600] + result["status"] = "wayback-error" + result["error_message"] = str(e)[:1600] return result except WaybackContentError as e: - result['status'] = 'wayback-content-error' - result['error_message'] = str(e)[:1600] + result["status"] = "wayback-content-error" + result["error_message"] = str(e)[:1600] return result except NotImplementedError as e: - result['status'] = 'not-implemented' - result['error_message'] = str(e)[:1600] + result["status"] = "not-implemented" + result["error_message"] = str(e)[:1600] return result assert resource if resource.terminal_url: - result['terminal'] = { + result["terminal"] = { "terminal_url": resource.terminal_url, "terminal_dt": resource.terminal_dt, "terminal_status_code": resource.terminal_status_code, } - if resource.terminal_url not in result['hops']: - result['hops'].append(resource.terminal_url) + if resource.terminal_url not in result["hops"]: + result["hops"].append(resource.terminal_url) if not resource.hit: - result['status'] = resource.status + result["status"] = resource.status return result if resource.terminal_url: for pattern in self.base_url_blocklist: if pattern in resource.terminal_url: - result['status'] = 'skip-url-blocklist' + result["status"] = "skip-url-blocklist" return result if resource.terminal_url: for pattern in self.cookie_blocklist: if pattern in resource.terminal_url: - result['status'] = 'blocked-cookie' + result["status"] = "blocked-cookie" return result if not resource.body: - result['status'] = 'null-body' + result["status"] = "null-body" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: - result['status'] = 'body-too-large' + result["status"] = "body-too-large" return result file_meta = gen_file_metadata(resource.body) try: file_meta, resource = fix_transfer_encoding(file_meta, resource) except Exception as e: - result['status'] = 'bad-gzip-encoding' - result['error_message'] = str(e) + result["status"] = "bad-gzip-encoding" + result["error_message"] = str(e) return result - if not resource.body or file_meta['size_bytes'] == 0: - result['status'] = 'null-body' + if not resource.body or file_meta["size_bytes"] == 0: + result["status"] = "null-body" return result # here we split based on ingest type to try and extract a next hop html_ish_resource = bool( - "html" in file_meta['mimetype'] - or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" - or "application/xml" in 
file_meta['mimetype'] - or "text/xml" in file_meta['mimetype']) + "html" in file_meta["mimetype"] + or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml" + or "application/xml" in file_meta["mimetype"] + or "text/xml" in file_meta["mimetype"] + ) html_biblio = None html_doc = None if html_ish_resource and resource.body: @@ -682,10 +712,11 @@ class IngestFileWorker(SandcrawlerWorker): html_doc = HTMLParser(resource.body) html_biblio = html_extract_biblio(resource.terminal_url, html_doc) if html_biblio: - if 'html_biblio' not in result and html_biblio.title: - result['html_biblio'] = json.loads( - html_biblio.json(exclude_none=True)) - #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr) + if "html_biblio" not in result and html_biblio.title: + result["html_biblio"] = json.loads( + html_biblio.json(exclude_none=True) + ) + # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr) except ValueError: pass @@ -700,27 +731,32 @@ class IngestFileWorker(SandcrawlerWorker): else: fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) - result['extract_next_hop'] = fulltext_url + result["extract_next_hop"] = fulltext_url if not fulltext_url: - result['status'] = 'no-pdf-link' + result["status"] = "no-pdf-link" return result - next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') or "" + next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or "" assert next_url next_url = clean_url(next_url) - print("[PARSE {:>6}] {} {}".format( - ingest_type, - fulltext_url.get('technique'), - next_url, - ), - file=sys.stderr) + print( + "[PARSE {:>6}] {} {}".format( + ingest_type, + fulltext_url.get("technique"), + next_url, + ), + file=sys.stderr, + ) if next_url in hops: - result['status'] = 'link-loop' - result['error_message'] = "repeated: {}".format(next_url) + result["status"] = "link-loop" + result["error_message"] = "repeated: {}".format(next_url) return result hops.append(next_url) continue - elif ingest_type in ("xml", "html", - "component") and html_ish_resource and html_biblio: + elif ( + ingest_type in ("xml", "html", "component") + and html_ish_resource + and html_biblio + ): # NOTE: src_fulltext_url is not a thing next_url_found = None if ingest_type == "xml" and html_biblio.xml_fulltext_url: @@ -733,18 +769,20 @@ class IngestFileWorker(SandcrawlerWorker): if next_url_found: next_url = next_url_found technique = "html_biblio" - print("[PARSE {:>6}] {} {}".format( - ingest_type, - technique, - next_url, - ), - file=sys.stderr) + print( + "[PARSE {:>6}] {} {}".format( + ingest_type, + technique, + next_url, + ), + file=sys.stderr, + ) if next_url in hops: if ingest_type == "html": # for HTML ingest, we don't count this as a link-loop break - result['status'] = 'link-loop' - result['error_message'] = "repeated: {}".format(next_url) + result["status"] = "link-loop" + result["error_message"] = "repeated: {}".format(next_url) return result hops.append(next_url) continue @@ -753,7 +791,7 @@ class IngestFileWorker(SandcrawlerWorker): break if len(hops) >= self.max_hops: - result['status'] = "max-hops-exceeded" + result["status"] = "max-hops-exceeded" return result # fetch must be a hit if we got this far (though not necessarily an ingest hit!) 
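For orientation, every early-return path in the loop above hands back the same flat result shape, keyed by a status string plus an optional error_message; a minimal sketch of that shape, with invented example values:

    # Illustrative only: the URL, datetime, and error text are invented.
    result = {
        "hit": False,
        "hops": ["https://example.com/article/123"],
        "status": "wayback-error",  # or spn2-error, cdx-error, blocked-cookie, ...
        "error_message": "error detail, truncated to 1600 chars",
        "terminal": {
            "terminal_url": "https://example.com/article/123",
            "terminal_dt": "20211026000000",
            "terminal_status_code": 200,
        },
    }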
@@ -762,38 +800,41 @@ class IngestFileWorker(SandcrawlerWorker): assert resource.terminal_status_code in (200, 226) if resource.terminal_url: - result['terminal'] = { + result["terminal"] = { "terminal_url": resource.terminal_url, "terminal_dt": resource.terminal_dt, "terminal_status_code": resource.terminal_status_code, - "terminal_sha1hex": file_meta['sha1hex'], + "terminal_sha1hex": file_meta["sha1hex"], } - result['file_meta'] = file_meta - result['cdx'] = cdx_to_dict(resource.cdx) + result["file_meta"] = file_meta + result["cdx"] = cdx_to_dict(resource.cdx) if resource.revisit_cdx: - result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx) + result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx) if ingest_type == "pdf": - if file_meta['mimetype'] != "application/pdf": - result['status'] = "wrong-mimetype" # formerly: "other-mimetype" + if file_meta["mimetype"] != "application/pdf": + result["status"] = "wrong-mimetype" # formerly: "other-mimetype" return result elif ingest_type == "xml": - if file_meta['mimetype'] not in ("application/xml", "text/xml", - "application/jats+xml"): - result['status'] = "wrong-mimetype" + if file_meta["mimetype"] not in ( + "application/xml", + "text/xml", + "application/jats+xml", + ): + result["status"] = "wrong-mimetype" return result elif ingest_type == "html": - if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): - result['status'] = "wrong-mimetype" + if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): + result["status"] = "wrong-mimetype" return result elif ingest_type == "src": - if file_meta['mimetype'] not in self.src_valid_mimetypes: - result['status'] = "wrong-mimetype" + if file_meta["mimetype"] not in self.src_valid_mimetypes: + result["status"] = "wrong-mimetype" return result elif ingest_type == "component": - if file_meta['mimetype'] not in self.component_valid_mimetypes: - result['status'] = "wrong-mimetype" + if file_meta["mimetype"] not in self.component_valid_mimetypes: + result["status"] = "wrong-mimetype" return result else: raise NotImplementedError() @@ -802,26 +843,30 @@ class IngestFileWorker(SandcrawlerWorker): result.update(info) # check if processing turned up an error - if info.get('status') not in ('success', None): - result['status'] = info['status'] + if info.get("status") not in ("success", None): + result["status"] = info["status"] return result - result['status'] = "success" - result['hit'] = True + result["status"] = "success" + result["hit"] = True if ingest_type == "pdf": - print("[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format( - ingest_type, - result.get('file_meta', {}).get('sha1hex'), - result.get('grobid', {}).get('status_code'), - result.get('pdf_meta', {}).get('status'), - ), - file=sys.stderr) + print( + "[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format( + ingest_type, + result.get("file_meta", {}).get("sha1hex"), + result.get("grobid", {}).get("status_code"), + result.get("pdf_meta", {}).get("status"), + ), + file=sys.stderr, + ) else: - print("[SUCCESS {:>5}] sha1:{}".format( - ingest_type, - result.get('file_meta', {}).get('sha1hex'), - ), - file=sys.stderr) + print( + "[SUCCESS {:>5}] sha1:{}".format( + ingest_type, + result.get("file_meta", {}).get("sha1hex"), + ), + file=sys.stderr, + ) return result @@ -832,11 +877,11 @@ class IngestFileRequestHandler(BaseHTTPRequestHandler): self.end_headers() self.wfile.write(b"404: Not Found") return - length = int(self.headers.get('content-length')) - request = 
json.loads(self.rfile.read(length).decode('utf-8')) + length = int(self.headers.get("content-length")) + request = json.loads(self.rfile.read(length).decode("utf-8")) print("Got request: {}".format(request)) ingester = IngestFileWorker() result = ingester.process(request) self.send_response(200) self.end_headers() - self.wfile.write(json.dumps(result).encode('utf8')) + self.wfile.write(json.dumps(result).encode("utf8")) diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 5728e24..227f511 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -6,15 +6,33 @@ from typing import Any, Dict, Optional import requests from selectolax.parser import HTMLParser -from sandcrawler.fileset_platforms import (ArchiveOrgHelper, DataverseHelper, FigshareHelper, - ZenodoHelper) -from sandcrawler.fileset_strategies import (ArchiveorgFilesetStrategy, ArchiveorgFileStrategy, - WebFilesetStrategy, WebFileStrategy) -from sandcrawler.fileset_types import (IngestStrategy, PlatformRestrictedError, - PlatformScopeError) +from sandcrawler.fileset_platforms import ( + ArchiveOrgHelper, + DataverseHelper, + FigshareHelper, + ZenodoHelper, +) +from sandcrawler.fileset_strategies import ( + ArchiveorgFilesetStrategy, + ArchiveorgFileStrategy, + WebFilesetStrategy, + WebFileStrategy, +) +from sandcrawler.fileset_types import ( + IngestStrategy, + PlatformRestrictedError, + PlatformScopeError, +) from sandcrawler.html_metadata import html_extract_biblio -from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError, - WaybackError, cdx_to_dict, fix_transfer_encoding) +from sandcrawler.ia import ( + CdxApiError, + PetaboxError, + SavePageNowError, + WaybackContentError, + WaybackError, + cdx_to_dict, + fix_transfer_encoding, +) from sandcrawler.ingest_file import IngestFileWorker from sandcrawler.misc import clean_url, gen_file_metadata from sandcrawler.workers import SandcrawlerWorker @@ -35,15 +53,16 @@ class IngestFilesetWorker(IngestFileWorker): checking to see if content has been archived already) 4. 
summarize status """ + def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs): super().__init__(sink=None, **kwargs) self.sink = sink self.dataset_platform_helpers = { - 'dataverse': DataverseHelper(), - 'figshare': FigshareHelper(), - 'zenodo': ZenodoHelper(), - 'archiveorg': ArchiveOrgHelper(), + "dataverse": DataverseHelper(), + "figshare": FigshareHelper(), + "zenodo": ZenodoHelper(), + "archiveorg": ArchiveOrgHelper(), } self.dataset_strategy_archivers = { IngestStrategy.ArchiveorgFileset: ArchiveorgFilesetStrategy(), @@ -52,10 +71,10 @@ class IngestFilesetWorker(IngestFileWorker): IngestStrategy.WebFile: WebFileStrategy(), } - self.max_total_size = kwargs.get('max_total_size', 64 * 1024 * 1024 * 1024) - self.max_file_count = kwargs.get('max_file_count', 200) - self.ingest_file_result_sink = kwargs.get('ingest_file_result_sink') - self.ingest_file_result_stdout = kwargs.get('ingest_file_result_stdout', False) + self.max_total_size = kwargs.get("max_total_size", 64 * 1024 * 1024 * 1024) + self.max_file_count = kwargs.get("max_file_count", 200) + self.ingest_file_result_sink = kwargs.get("ingest_file_result_sink") + self.ingest_file_result_stdout = kwargs.get("ingest_file_result_stdout", False) def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]: """ @@ -65,7 +84,7 @@ class IngestFilesetWorker(IngestFileWorker): return None existing = self.pgrest_client.get_ingest_fileset_platform(ingest_type, base_url) # TODO: filter on more flags? - if existing and existing['hit'] is True: + if existing and existing["hit"] is True: return existing else: return None @@ -78,112 +97,114 @@ class IngestFilesetWorker(IngestFileWorker): raise NotImplementedError("process_existing() not tested or safe yet") def want(self, request: dict) -> bool: - if not request.get('ingest_type') in ('dataset', ): + if not request.get("ingest_type") in ("dataset",): return False return True - def fetch_resource_iteratively(self, ingest_type: str, base_url: str, - force_recrawl: bool) -> dict: + def fetch_resource_iteratively( + self, ingest_type: str, base_url: str, force_recrawl: bool + ) -> dict: """ This is copypasta from process_file(), should probably refactor. 
""" result: Dict[str, Any] = dict(hit=False) - result['hops'] = [base_url] + result["hops"] = [base_url] next_url = base_url # check against blocklist for block in self.base_url_blocklist: # NOTE: hack to not skip archive.org content - if 'archive.org' in block: + if "archive.org" in block: continue if block in next_url: - result['status'] = "skip-url-blocklist" + result["status"] = "skip-url-blocklist" return result try: resource = self.find_resource(next_url, force_recrawl=force_recrawl) except SavePageNowError as e: - result['status'] = 'spn2-error' - result['error_message'] = str(e)[:1600] + result["status"] = "spn2-error" + result["error_message"] = str(e)[:1600] return result except PetaboxError as e: - result['status'] = 'petabox-error' - result['error_message'] = str(e)[:1600] + result["status"] = "petabox-error" + result["error_message"] = str(e)[:1600] return result except CdxApiError as e: - result['status'] = 'cdx-error' - result['error_message'] = str(e)[:1600] + result["status"] = "cdx-error" + result["error_message"] = str(e)[:1600] # add a sleep in cdx-error path as a slow-down time.sleep(2.0) return result except WaybackError as e: - result['status'] = 'wayback-error' - result['error_message'] = str(e)[:1600] + result["status"] = "wayback-error" + result["error_message"] = str(e)[:1600] return result except WaybackContentError as e: - result['status'] = 'wayback-content-error' - result['error_message'] = str(e)[:1600] + result["status"] = "wayback-content-error" + result["error_message"] = str(e)[:1600] return result except NotImplementedError: - #result['status'] = 'not-implemented' - #result['error_message'] = str(e)[:1600] - #return result + # result['status'] = 'not-implemented' + # result['error_message'] = str(e)[:1600] + # return result resource = None html_biblio = None if resource: if resource.terminal_url: - result['terminal'] = { + result["terminal"] = { "terminal_url": resource.terminal_url, "terminal_dt": resource.terminal_dt, "terminal_status_code": resource.terminal_status_code, } - if resource.terminal_url not in result['hops']: - result['hops'].append(resource.terminal_url) + if resource.terminal_url not in result["hops"]: + result["hops"].append(resource.terminal_url) if not resource.hit: - result['status'] = resource.status + result["status"] = resource.status return result if resource.terminal_url: for pattern in self.base_url_blocklist: if pattern in resource.terminal_url: - result['status'] = 'skip-url-blocklist' + result["status"] = "skip-url-blocklist" return result if resource.terminal_url: for pattern in self.cookie_blocklist: if pattern in resource.terminal_url: - result['status'] = 'blocked-cookie' + result["status"] = "blocked-cookie" return result if not resource.body: - result['status'] = 'null-body' + result["status"] = "null-body" return result if len(resource.body) > MAX_BODY_SIZE_BYTES: - result['status'] = 'body-too-large' + result["status"] = "body-too-large" return result file_meta = gen_file_metadata(resource.body) try: file_meta, resource = fix_transfer_encoding(file_meta, resource) except Exception as e: - result['status'] = 'bad-gzip-encoding' - result['error_message'] = str(e) + result["status"] = "bad-gzip-encoding" + result["error_message"] = str(e) return result - if not resource.body or file_meta['size_bytes'] == 0: - result['status'] = 'null-body' + if not resource.body or file_meta["size_bytes"] == 0: + result["status"] = "null-body" return result # here we split based on ingest type to try and extract a next hop 
html_ish_resource = bool( - "html" in file_meta['mimetype'] - or "xhtml" in file_meta['mimetype'] # matches "application/xhtml+xml" - or "application/xml" in file_meta['mimetype'] - or "text/xml" in file_meta['mimetype']) + "html" in file_meta["mimetype"] + or "xhtml" in file_meta["mimetype"] # matches "application/xhtml+xml" + or "application/xml" in file_meta["mimetype"] + or "text/xml" in file_meta["mimetype"] + ) html_biblio = None html_doc = None if html_ish_resource and resource.body: @@ -191,10 +212,11 @@ class IngestFilesetWorker(IngestFileWorker): html_doc = HTMLParser(resource.body) html_biblio = html_extract_biblio(resource.terminal_url, html_doc) if html_biblio: - if 'html_biblio' not in result and html_biblio.title: - result['html_biblio'] = json.loads( - html_biblio.json(exclude_none=True)) - #print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr) + if "html_biblio" not in result and html_biblio.title: + result["html_biblio"] = json.loads( + html_biblio.json(exclude_none=True) + ) + # print(f" setting html_biblio: {result['html_biblio']}", file=sys.stderr) except ValueError: pass @@ -204,69 +226,72 @@ class IngestFilesetWorker(IngestFileWorker): assert resource.terminal_status_code in (200, 226) if resource.terminal_url: - result['terminal'] = { + result["terminal"] = { "terminal_url": resource.terminal_url, "terminal_dt": resource.terminal_dt, "terminal_status_code": resource.terminal_status_code, - "terminal_sha1hex": file_meta['sha1hex'], + "terminal_sha1hex": file_meta["sha1hex"], } - result['file_meta'] = file_meta - result['cdx'] = cdx_to_dict(resource.cdx) + result["file_meta"] = file_meta + result["cdx"] = cdx_to_dict(resource.cdx) if resource.revisit_cdx: - result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx) + result["revisit_cdx"] = cdx_to_dict(resource.revisit_cdx) if ingest_type == "pdf": - if file_meta['mimetype'] != "application/pdf": - result['status'] = "wrong-mimetype" # formerly: "other-mimetype" + if file_meta["mimetype"] != "application/pdf": + result["status"] = "wrong-mimetype" # formerly: "other-mimetype" return result elif ingest_type == "xml": - if file_meta['mimetype'] not in ("application/xml", "text/xml", - "application/jats+xml"): - result['status'] = "wrong-mimetype" + if file_meta["mimetype"] not in ( + "application/xml", + "text/xml", + "application/jats+xml", + ): + result["status"] = "wrong-mimetype" return result elif ingest_type == "html": - if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"): - result['status'] = "wrong-mimetype" + if file_meta["mimetype"] not in ("text/html", "application/xhtml+xml"): + result["status"] = "wrong-mimetype" return result else: - #raise NotImplementedError() + # raise NotImplementedError() pass - result['_html_biblio'] = html_biblio - result['_resource'] = resource + result["_html_biblio"] = html_biblio + result["_resource"] = resource return result def process(self, request: dict, key: Any = None) -> dict: - ingest_type = request.get('ingest_type') - if ingest_type not in ("dataset", ): + ingest_type = request.get("ingest_type") + if ingest_type not in ("dataset",): raise NotImplementedError(f"can't handle ingest_type={ingest_type}") # parse/clean URL # note that we pass through the original/raw URL, and that is what gets # persisted in database table - base_url = clean_url(request['base_url']) + base_url = clean_url(request["base_url"]) - force_recrawl = bool(request.get('force_recrawl', False)) + force_recrawl = bool(request.get("force_recrawl", False)) 
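A sketch of the kind of request dict this process() method expects (the fields match what the surrounding code reads; the Zenodo URL is invented, and any platform covered by dataset_platform_helpers would work):

    # Illustrative dataset-ingest request; the URL is invented.
    request = {
        "ingest_type": "dataset",
        "base_url": "https://zenodo.org/record/1234567",
        "force_recrawl": False,
    }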
print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr) # TODO: "existing" check against file and/or fileset ingest result table - #existing = self.check_existing_ingest(ingest_type, base_url) - #if existing: + # existing = self.check_existing_ingest(ingest_type, base_url) + # if existing: # return self.process_existing(request, existing) - result = self.fetch_resource_iteratively(ingest_type, - base_url, - force_recrawl=force_recrawl) - result['request'] = request - if result.get('status') is not None: - result['request'] = request + result = self.fetch_resource_iteratively( + ingest_type, base_url, force_recrawl=force_recrawl + ) + result["request"] = request + if result.get("status") is not None: + result["request"] = request return result - html_biblio = result.pop('_html_biblio') - resource = result.pop('_resource') + html_biblio = result.pop("_html_biblio") + resource = result.pop("_resource") # 1. Determine `platform`, which may involve resolving redirects and crawling a landing page. @@ -280,166 +305,183 @@ class IngestFilesetWorker(IngestFileWorker): break if not platform_helper: - result['status'] = 'no-platform-match' + result["status"] = "no-platform-match" return result # 2. Use platform-specific methods to fetch manifest metadata and decide on an `ingest_strategy`. try: dataset_meta = platform_helper.process_request(request, resource, html_biblio) except PlatformScopeError as e: - result['status'] = 'platform-scope' - result['error_message'] = str(e)[:1600] + result["status"] = "platform-scope" + result["error_message"] = str(e)[:1600] return result except PlatformRestrictedError as e: - result['status'] = 'platform-restricted' - result['error_message'] = str(e)[:1600] + result["status"] = "platform-restricted" + result["error_message"] = str(e)[:1600] return result except NotImplementedError as e: - result['status'] = 'not-implemented' - result['error_message'] = str(e)[:1600] + result["status"] = "not-implemented" + result["error_message"] = str(e)[:1600] return result except requests.exceptions.HTTPError as e: if e.response.status_code == 404: - result['status'] = 'platform-404' - result['error_message'] = str(e)[:1600] + result["status"] = "platform-404" + result["error_message"] = str(e)[:1600] return result else: raise e - #print(dataset_meta, file=sys.stderr) + # print(dataset_meta, file=sys.stderr) platform = dataset_meta.platform_name - result['platform_name'] = dataset_meta.platform_name - result['platform_domain'] = dataset_meta.platform_domain - result['platform_id'] = dataset_meta.platform_id - result['platform_base_url'] = dataset_meta.web_base_url - result['archiveorg_item_name'] = dataset_meta.archiveorg_item_name + result["platform_name"] = dataset_meta.platform_name + result["platform_domain"] = dataset_meta.platform_domain + result["platform_id"] = dataset_meta.platform_id + result["platform_base_url"] = dataset_meta.web_base_url + result["archiveorg_item_name"] = dataset_meta.archiveorg_item_name if not dataset_meta.manifest: - result['status'] = 'empty-manifest' + result["status"] = "empty-manifest" return result # these will get confirmed/updated after ingest - result['manifest'] = [m.dict(exclude_none=True) for m in dataset_meta.manifest] - result['file_count'] = len(dataset_meta.manifest) - result['total_size'] = sum([m.size for m in dataset_meta.manifest if m.size]) + result["manifest"] = [m.dict(exclude_none=True) for m in dataset_meta.manifest] + result["file_count"] = len(dataset_meta.manifest) + result["total_size"] = sum([m.size 
for m in dataset_meta.manifest if m.size]) - if result['total_size'] > self.max_total_size: - result['status'] = 'too-large-size' + if result["total_size"] > self.max_total_size: + result["status"] = "too-large-size" return result - if result['file_count'] > self.max_file_count: + if result["file_count"] > self.max_file_count: # hard max, to prevent downstream breakage - if result['file_count'] > 10 * 1000: - result['manifest'] = result['manifest'][:self.max_file_count] - result['status'] = 'too-many-files' + if result["file_count"] > 10 * 1000: + result["manifest"] = result["manifest"][: self.max_file_count] + result["status"] = "too-many-files" return result ingest_strategy = platform_helper.chose_strategy(dataset_meta) - result['ingest_strategy'] = ingest_strategy + result["ingest_strategy"] = ingest_strategy print( f"[PLATFORM {platform}] id={dataset_meta.platform_id} file_count={result['file_count']} total_size={result['total_size']} strategy={ingest_strategy}", - file=sys.stderr) + file=sys.stderr, + ) strategy_helper = self.dataset_strategy_archivers.get(ingest_strategy) if not strategy_helper: - result['status'] = 'no-strategy-helper' + result["status"] = "no-strategy-helper" return result # 3. Use strategy-specific methods to archive all files in platform manifest, and verify manifest metadata. archive_result = strategy_helper.process(dataset_meta) # 4. Summarize status and return structured result metadata. - result['status'] = archive_result.status - result['manifest'] = [m.dict(exclude_none=True) for m in archive_result.manifest] + result["status"] = archive_result.status + result["manifest"] = [m.dict(exclude_none=True) for m in archive_result.manifest] - if ingest_strategy.endswith('-fileset-bundle'): - result['fileset_bundle'] = dict() + if ingest_strategy.endswith("-fileset-bundle"): + result["fileset_bundle"] = dict() if archive_result.bundle_file_meta: - result['fileset_bundle']['file_meta'] = archive_result.bundle_file_meta + result["fileset_bundle"]["file_meta"] = archive_result.bundle_file_meta if archive_result.bundle_archiveorg_path: - result['fileset_bundle'][ - 'archiveorg_bundle_path'] = archive_result.bundle_archiveorg_path + result["fileset_bundle"][ + "archiveorg_bundle_path" + ] = archive_result.bundle_archiveorg_path if archive_result.bundle_resource: - result['fileset_bundle']['terminal'] = dict( + result["fileset_bundle"]["terminal"] = dict( terminal_url=archive_result.bundle_resource.terminal_url, terminal_dt=archive_result.bundle_resource.terminal_dt, terminal_status_code=archive_result.bundle_resource.terminal_status_code, ) if archive_result.bundle_resource.cdx: - result['fileset_bundle']['cdx'] = cdx_to_dict( - archive_result.bundle_resource.cdx) + result["fileset_bundle"]["cdx"] = cdx_to_dict( + archive_result.bundle_resource.cdx + ) if archive_result.bundle_resource.revisit_cdx: - result['fileset_bundle']['revisit_cdx'] = cdx_to_dict( - archive_result.bundle_resource.revisit_cdx) + result["fileset_bundle"]["revisit_cdx"] = cdx_to_dict( + archive_result.bundle_resource.revisit_cdx + ) - if ingest_strategy.endswith('-file'): - result['fileset_file'] = dict() + if ingest_strategy.endswith("-file"): + result["fileset_file"] = dict() if archive_result.file_file_meta: - result['fileset_file']['file_meta'] = archive_result.file_file_meta, + result["fileset_file"]["file_meta"] = (archive_result.file_file_meta,) if archive_result.file_resource: - result['fileset_file']['terminal'] = dict( + result["fileset_file"]["terminal"] = dict( 
terminal_url=archive_result.file_resource.terminal_url, terminal_dt=archive_result.file_resource.terminal_dt, terminal_status_code=archive_result.file_resource.terminal_status_code, ) if archive_result.file_resource.cdx: - result['fileset_file']['cdx'] = cdx_to_dict( - archive_result.file_resource.cdx) + result["fileset_file"]["cdx"] = cdx_to_dict( + archive_result.file_resource.cdx + ) if archive_result.file_resource.revisit_cdx: - result['fileset_file']['revisit_cdx'] = cdx_to_dict( - archive_result.file_resource.revisit_cdx) + result["fileset_file"]["revisit_cdx"] = cdx_to_dict( + archive_result.file_resource.revisit_cdx + ) - if result['status'].startswith('success'): + if result["status"].startswith("success"): # check that these are still valid - assert result['file_count'] == len(archive_result.manifest) - assert result['total_size'] == sum( - [m.size for m in archive_result.manifest if m.size]) + assert result["file_count"] == len(archive_result.manifest) + assert result["total_size"] == sum( + [m.size for m in archive_result.manifest if m.size] + ) - if result[ - 'status'] == 'success-file' and archive_result.file_resource and archive_result.file_file_meta: + if ( + result["status"] == "success-file" + and archive_result.file_resource + and archive_result.file_file_meta + ): file_result: Dict[str, Any] = dict( hit=True, - status='success', + status="success", request=request.copy(), file_meta=archive_result.file_file_meta, terminal=dict( terminal_url=archive_result.file_resource.terminal_url, terminal_dt=archive_result.file_resource.terminal_dt, terminal_status_code=archive_result.file_resource.terminal_status_code, - terminal_sha1hex=archive_result.file_file_meta['sha1hex'], + terminal_sha1hex=archive_result.file_file_meta["sha1hex"], ), ) if archive_result.file_resource.cdx: - file_result['cdx'] = cdx_to_dict(archive_result.file_resource.cdx) + file_result["cdx"] = cdx_to_dict(archive_result.file_resource.cdx) if archive_result.file_resource.revisit_cdx: - file_result['revisit_cdx'] = cdx_to_dict( - archive_result.file_resource.revisit_cdx) - file_result['request']['ingest_type'] = request['ingest_type'] + "-file" + file_result["revisit_cdx"] = cdx_to_dict( + archive_result.file_resource.revisit_cdx + ) + file_result["request"]["ingest_type"] = request["ingest_type"] + "-file" # call the super() (ingest_file) version of process_hit() - info = self.process_file_hit(file_result['request']['ingest_type'], - archive_result.file_resource, - archive_result.file_file_meta) + info = self.process_file_hit( + file_result["request"]["ingest_type"], + archive_result.file_resource, + archive_result.file_file_meta, + ) file_result.update(info) if self.ingest_file_result_sink: self.ingest_file_result_sink.push_record(result.copy()) elif self.ingest_file_result_stdout: sys.stdout.write(json.dumps(file_result, sort_keys=True) + "\n") - if result['status'].startswith('success'): - result['hit'] = True - print("[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format( - ingest_type, - result['file_count'], - result['total_size'], - ingest_strategy, - ), - file=sys.stderr) + if result["status"].startswith("success"): + result["hit"] = True + print( + "[SUCCESS {:>5}] file_count={} total_size={} strategy={}".format( + ingest_type, + result["file_count"], + result["total_size"], + ingest_strategy, + ), + file=sys.stderr, + ) else: - print("[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format( - ingest_type, - result['status'], - result['file_count'], - result['total_size'], - 
ingest_strategy, - ), - file=sys.stderr) + print( + "[FAIL {:>5}] status={} file_count={} total_size={} strategy={}".format( + ingest_type, + result["status"], + result["file_count"], + result["total_size"], + ingest_strategy, + ), + file=sys.stderr, + ) return result diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 91e5c6e..0ff7fe0 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -9,12 +9,26 @@ import pydantic import trafilatura from selectolax.parser import HTMLParser -from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio, - html_extract_resources, load_adblock_rules) -from sandcrawler.ia import (CdxApiClient, NoCaptureError, WaybackClient, WaybackContentError, - cdx_to_dict, fix_transfer_encoding) -from sandcrawler.misc import (datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, - url_fuzzy_equal) +from sandcrawler.html_metadata import ( + BiblioMetadata, + html_extract_biblio, + html_extract_resources, + load_adblock_rules, +) +from sandcrawler.ia import ( + CdxApiClient, + NoCaptureError, + WaybackClient, + WaybackContentError, + cdx_to_dict, + fix_transfer_encoding, +) +from sandcrawler.misc import ( + datetime_to_cdx, + gen_file_metadata, + parse_cdx_datetime, + url_fuzzy_equal, +) TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" @@ -23,7 +37,7 @@ def html_extract_body_teixml(doc: bytes) -> dict: try: tei_xml = trafilatura.extract( doc, - output_format='xmltei', + output_format="xmltei", include_comments=False, include_formatting=True, ) @@ -35,12 +49,11 @@ def html_extract_body_teixml(doc: bytes) -> dict: if tei_xml: body_txt = teixml_body_text(tei_xml) word_count = len(body_txt.split()) - return dict(status="success", - agent=TRAFILATURA_AGENT, - tei_xml=tei_xml, - word_count=word_count) + return dict( + status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count + ) elif doc.startswith( - b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">' + b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">' ): # hack for firstmonday.org return html_extract_body_teixml(doc[106:]) @@ -51,7 +64,7 @@ def html_extract_body_teixml(doc: bytes) -> dict: def teixml_body_text(doc_xml: str) -> str: ns = {"tei": "http://www.tei-c.org/ns/1.0"} tree = ET.fromstring(doc_xml) - body = tree.find('.//tei:body', ns) + body = tree.find(".//tei:body", ns) if body: return " ".join(body.itertext()) else: @@ -126,8 +139,9 @@ class HtmlMetaRow(pydantic.BaseModel): ) -def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, - when: Optional[datetime.datetime]) -> List[WebResource]: +def quick_fetch_html_resources( + resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime] +) -> List[WebResource]: """ This is the lazy version that just does a CDX lookup for each resource. 
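The resources argument here (and in fetch_html_resources further down) is a list of plain dicts with "url" and "type" keys, as used in the lookup loop below; a minimal illustrative call, assuming CdxApiClient() can be constructed with its defaults, with invented URL, type, and timestamp values:

    # Illustrative only; the URL, resource type, and timestamp are invented.
    import datetime

    from sandcrawler.ia import CdxApiClient
    from sandcrawler.ingest_html import quick_fetch_html_resources

    cdx_client = CdxApiClient()
    resources = [{"url": "https://example.com/static/site.css", "type": "stylesheet"}]
    when = datetime.datetime(2020, 10, 28, 23, 51, 3)
    web_resources = quick_fetch_html_resources(resources, cdx_client, when)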
@@ -138,12 +152,13 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, full = [] closest = when and datetime_to_cdx(when) for resource in resources: - cdx_row = cdx_client.lookup_best(resource['url'], closest=closest) + cdx_row = cdx_client.lookup_best(resource["url"], closest=closest) if not cdx_row: raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") - if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']): - print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", - file=sys.stderr) + if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]): + print( + f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr + ) if not cdx_row.status_code: # TODO: fall back to a full fetch? print(" WARN: skipping revisit record", file=sys.stderr) @@ -158,14 +173,16 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, status_code=cdx_row.status_code, size=None, sha256hex=None, - resource_type=resource['type'], - )) + resource_type=resource["type"], + ) + ) return full -def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, - when: Optional[datetime.datetime]) -> List[WebResource]: +def fetch_html_resources( + resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime] +) -> List[WebResource]: """ This is the full version which fetches each resource from wayback/petabox and calculates additional hashes. @@ -176,11 +193,11 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, full = [] closest = when and datetime_to_cdx(when) for resource in resources: - wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest) - if not wayback_resp or wayback_resp.status != 'success': + wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest) + if not wayback_resp or wayback_resp.status != "success": raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True) - if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex: + if file_meta["sha1hex"] != wayback_resp.cdx.sha1hex: raise WaybackContentError( f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}" ) @@ -189,25 +206,27 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, surt=wayback_resp.cdx.surt, timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime), url=wayback_resp.cdx.url, - sha1hex=file_meta['sha1hex'], - mimetype=file_meta['mimetype'], + sha1hex=file_meta["sha1hex"], + mimetype=file_meta["mimetype"], status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code, - size=file_meta['size_bytes'], - sha256hex=file_meta['sha256hex'], - resource_type=resource['type'], - )) + size=file_meta["size_bytes"], + sha256hex=file_meta["sha256hex"], + resource_type=resource["type"], + ) + ) return full -def html_guess_platform(url: str, doc: HTMLParser, - biblio: Optional[BiblioMetadata]) -> Optional[str]: +def html_guess_platform( + url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] +) -> Optional[str]: generator: Optional[str] = None generator_elem = doc.css_first("meta[name='generator']") if generator_elem: - generator = generator_elem.attrs['content'] + generator = generator_elem.attrs["content"] else: generator_elem = doc.css_first("a[id='developedBy']") if generator_elem: @@ -226,7 +245,10 @@ def 
html_guess_platform(url: str, doc: HTMLParser, return "ojs" else: try: - if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html: + if ( + 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' + in doc.html + ): return "ojs" if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html: return "arpha" @@ -236,20 +258,21 @@ def html_guess_platform(url: str, doc: HTMLParser, pass icon_elem = doc.css_first("link[type='image/x-icon']") - if icon_elem and 'href' in icon_elem.attrs: - if 'journalssystem.com' in icon_elem.attrs['href']: + if icon_elem and "href" in icon_elem.attrs: + if "journalssystem.com" in icon_elem.attrs["href"]: return "journalssystem.com" - elif 'indexcopernicus.com' in icon_elem.attrs['href']: + elif "indexcopernicus.com" in icon_elem.attrs["href"]: return "indexcopernicus" - if 'scielo' in url: + if "scielo" in url: return "scielo" return None -def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], - word_count: Optional[int]) -> str: +def html_guess_scope( + url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int] +) -> str: """ This function tries to guess if an HTML document represents one of: @@ -275,7 +298,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] """ # assert that this is a real URL - assert url.count('/') >= 2 + assert url.count("/") >= 2 # basic paywall and loginwall detection based on URL if url.endswith("/cookieAbsent"): @@ -293,7 +316,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] return "blocked-captcha" # is this the top-level URL of the domain? aka, no path? - if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'): + if url.count("/") <= 2 or (url.count("/") == 3) and url.endswith("/"): return "homepage-domain" platform = html_guess_platform(url, doc, biblio) @@ -340,7 +363,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] if word_count is not None: if word_count < 20: return "stub" - elif word_count > 500 and platform in ['wordpress', 'blogger']: + elif word_count > 500 and platform in ["wordpress", "blogger"]: return "article-fulltext" elif word_count > 1200: return "article-fulltext" @@ -348,9 +371,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] return "unknown" -def run_single(url: str, - timestamp: Optional[str] = None, - quick_mode: bool = False) -> IngestWebResult: +def run_single( + url: str, timestamp: Optional[str] = None, quick_mode: bool = False +) -> IngestWebResult: adblock = load_adblock_rules() wayback_client = WaybackClient() @@ -368,7 +391,7 @@ def run_single(url: str, file_meta = gen_file_metadata(html_resource.body) file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource) - if file_meta['mimetype'] not in ("text/html", "text/xml"): + if file_meta["mimetype"] not in ("text/html", "text/xml"): return IngestWebResult( status="wrong-mimetype", hit=False, @@ -379,8 +402,8 @@ def run_single(url: str, html_doc = HTMLParser(html_resource.body) html_biblio = html_extract_biblio(url, html_doc) html_body = html_extract_body_teixml(html_resource.body) - html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count')) - if html_scope not in ('article-fulltext', 'unknown'): + html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count")) + if html_scope not in ("article-fulltext", "unknown"): return 
IngestWebResult( status="wrong-scope", hit=False, @@ -397,8 +420,9 @@ def run_single(url: str, full_resources: List[WebResource] = [] if quick_mode: - full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, - when) + full_resources = quick_fetch_html_resources( + raw_resources, wayback_client.cdx_client, when + ) else: full_resources = fetch_html_resources(raw_resources, wayback_client, when) @@ -425,8 +449,9 @@ def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) subparsers = parser.add_subparsers() - sub = subparsers.add_parser("single", - help="tries to ingest a single URL, dumps result to stdout") + sub = subparsers.add_parser( + "single", help="tries to ingest a single URL, dumps result to stdout" + ) sub.set_defaults(func="run_single") sub.add_argument( "url", @@ -453,8 +478,8 @@ def main() -> None: result = run_single(args.url, args.timestamp, args.quick_mode) print(result.json(indent=2, exclude_none=True)) else: - #func = getattr(wp, args.func) - #func() + # func = getattr(wp, args.func) + # func() raise NotImplementedError() diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index 1967ba3..d47ab89 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -6,11 +6,13 @@ import minio class SandcrawlerMinioClient(object): - def __init__(self, - host_url: str, - access_key: str, - secret_key: str, - default_bucket: Optional[str] = None): + def __init__( + self, + host_url: str, + access_key: str, + secret_key: str, + default_bucket: Optional[str] = None, + ): """ host is minio connection string (host:port) access and secret key are as expected @@ -46,13 +48,15 @@ class SandcrawlerMinioClient(object): ) return obj_path - def put_blob(self, - folder: str, - blob: Union[str, bytes], - sha1hex: Optional[str] = None, - extension: str = "", - prefix: str = "", - bucket: Optional[str] = None) -> Tuple[str, str]: + def put_blob( + self, + folder: str, + blob: Union[str, bytes], + sha1hex: Optional[str] = None, + extension: str = "", + prefix: str = "", + bucket: Optional[str] = None, + ) -> Tuple[str, str]: """ blob should be bytes sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated @@ -61,7 +65,7 @@ class SandcrawlerMinioClient(object): filename is SHA1 with an optional file extension. 
""" if type(blob) == str: - blob = blob.encode('utf-8') + blob = blob.encode("utf-8") assert type(blob) == bytes if not sha1hex: h = hashlib.sha1() @@ -72,13 +76,13 @@ class SandcrawlerMinioClient(object): bucket = self.default_bucket assert bucket content_type = "application/octet-stream" - if extension.endswith('.xml'): + if extension.endswith(".xml"): content_type = "application/xml" - if extension.endswith('.png'): + if extension.endswith(".png"): content_type = "image/png" - elif extension.endswith('.jpg') or extension.endswith('.jpeg'): + elif extension.endswith(".jpg") or extension.endswith(".jpeg"): content_type = "image/jpeg" - elif extension.endswith('.txt'): + elif extension.endswith(".txt"): content_type = "text/plain" self.mc.put_object( bucket, @@ -89,12 +93,14 @@ class SandcrawlerMinioClient(object): ) return (bucket, obj_path) - def get_blob(self, - folder: str, - sha1hex: str, - extension: str = "", - prefix: str = "", - bucket: str = None) -> bytes: + def get_blob( + self, + folder: str, + sha1hex: str, + extension: str = "", + prefix: str = "", + bucket: str = None, + ) -> bytes: """ sha1hex is sha1 of the blob itself diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index 1c779ce..db001dd 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -15,7 +15,7 @@ def clean_url(s: str) -> str: s = s.strip() parsed = urlcanon.parse_url(s) if not parsed.port and parsed.colon_before_port: - parsed.colon_before_port = b'' + parsed.colon_before_port = b"" return str(urlcanon.whatwg(parsed)) @@ -23,10 +23,12 @@ def url_fuzzy_equal(left: str, right: str) -> bool: """ TODO: use proper surt library and canonicalization for this check """ - fuzzy_left = '://'.join( - clean_url(left).replace('www.', '').replace(':80/', '/').split('://')[1:]) - fuzzy_right = '://'.join( - clean_url(right).replace('www.', '').replace(':80/', '/').split('://')[1:]) + fuzzy_left = "://".join( + clean_url(left).replace("www.", "").replace(":80/", "/").split("://")[1:] + ) + fuzzy_right = "://".join( + clean_url(right).replace("www.", "").replace(":80/", "/").split("://")[1:] + ) if fuzzy_left == fuzzy_right: return True elif fuzzy_left == fuzzy_right + "/" or fuzzy_right == fuzzy_left + "/": @@ -35,10 +37,13 @@ def url_fuzzy_equal(left: str, right: str) -> bool: def test_url_fuzzy_equal() -> None: - assert url_fuzzy_equal( - "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", - "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree" - ) is True + assert ( + url_fuzzy_equal( + "http://www.annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", + "http://annalsofian.org/article.asp?issn=0972-2327;year=2014;volume=17;issue=4;spage=463;epage=465;aulast=Nithyashree", + ) + is True + ) def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: @@ -53,7 +58,7 @@ def gen_file_metadata(blob: bytes, allow_empty: bool = False) -> dict: if len(blob) < 1024 * 1024: mimetype = magic.Magic(mime=True).from_buffer(blob) else: - mimetype = magic.Magic(mime=True).from_buffer(blob[:(1024 * 1024)]) + mimetype = magic.Magic(mime=True).from_buffer(blob[: (1024 * 1024)]) if mimetype in ("application/xml", "text/xml"): # crude checks for XHTML or JATS XML, using only first 1 kB of file if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]: @@ -83,10 +88,13 @@ def 
gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: assert path is not None mimetype = magic.Magic(mime=True).from_file(path) if mimetype in ("application/xml", "text/xml"): - with open(path, 'rb') as f: + with open(path, "rb") as f: blob = f.read(1024) # crude checks for XHTML or JATS XML, using only first 1 kB of file - if b"<htm" in blob[:1024] and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024]: + if ( + b"<htm" in blob[:1024] + and b'xmlns="http://www.w3.org/1999/xhtml"' in blob[:1024] + ): mimetype = "application/xhtml+xml" elif b"<article " in blob[:1024] and b"<html" not in blob[:1024]: mimetype = "application/jats+xml" @@ -96,7 +104,7 @@ def gen_file_metadata_path(path: str, allow_empty: bool = False) -> dict: hashlib.md5(), ] size_bytes = 0 - with open(path, 'rb') as f: + with open(path, "rb") as f: while True: chunk = f.read(1024 * 1024) if not chunk: @@ -128,15 +136,15 @@ def b32_hex(s: str) -> str: if len(s) == 40: return s raise ValueError("not a base-32 encoded SHA-1 hash: {}".format(s)) - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") NORMAL_MIME = ( - 'application/pdf', - 'application/postscript', - 'text/html', - 'text/xml', - 'application/octet-stream', + "application/pdf", + "application/postscript", + "text/html", + "text/xml", + "application/octet-stream", ) @@ -147,22 +155,22 @@ def normalize_mime(raw: str) -> Optional[str]: return norm # Special cases - if raw.startswith('application/xml'): - return 'text/xml' - if raw.startswith('application/x-pdf'): - return 'application/pdf' - if raw in ('.pdf', ): - return 'application/pdf' + if raw.startswith("application/xml"): + return "text/xml" + if raw.startswith("application/x-pdf"): + return "application/pdf" + if raw in (".pdf",): + return "application/pdf" if raw in ( - 'application/download', - 'binary/octet-stream', - 'unk', - 'application/x-download', - 'application/octetstream', - 'application/force-download', - 'application/unknown', + "application/download", + "binary/octet-stream", + "unk", + "application/x-download", + "application/octetstream", + "application/force-download", + "application/unknown", ): - return 'application/octet-stream' + return "application/octet-stream" return None @@ -200,14 +208,19 @@ def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]: offset = cdx[9] warc = cdx[10] - if not (sha1b32.isalnum() and c_size.isdigit() and offset.isdigit() and len(sha1b32) == 32 - and dt.isdigit()): + if not ( + sha1b32.isalnum() + and c_size.isdigit() + and offset.isdigit() + and len(sha1b32) == 32 + and dt.isdigit() + ): return None - if '-' in (surt, dt, url, http_status, sha1b32, c_size, offset, warc): + if "-" in (surt, dt, url, http_status, sha1b32, c_size, offset, warc): return None - if mime is None or mime == '-': + if mime is None or mime == "-": mime = "application/octet-stream" if normalize: @@ -242,16 +255,13 @@ def test_parse_cdx_datetime() -> None: assert parse_cdx_datetime("") is None assert parse_cdx_datetime("asdf") is None assert parse_cdx_datetime("19930203123045") is not None - assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020, - month=10, - day=28, - hour=23, - minute=51, - second=3) + assert parse_cdx_datetime("20201028235103") == datetime.datetime( + year=2020, month=10, day=28, hour=23, minute=51, second=3 + ) def datetime_to_cdx(dt: datetime.datetime) -> str: - return '%04d%02d%02d%02d%02d%02d' % ( + 
return "%04d%02d%02d%02d%02d%02d" % ( dt.year, dt.month, dt.day, @@ -263,13 +273,16 @@ def datetime_to_cdx(dt: datetime.datetime) -> str: def test_datetime_to_cdx() -> None: assert "20201028235103" == datetime_to_cdx( - datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3)) + datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3) + ) -def requests_retry_session(retries: int = 10, - backoff_factor: int = 3, - status_forcelist: List[int] = [500, 502, 504], - session: requests.Session = None) -> requests.Session: +def requests_retry_session( + retries: int = 10, + backoff_factor: int = 3, + status_forcelist: List[int] = [500, 502, 504], + session: requests.Session = None, +) -> requests.Session: """ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests """ @@ -282,8 +295,8 @@ def requests_retry_session(retries: int = 10, status_forcelist=status_forcelist, ) adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) + session.mount("http://", adapter) + session.mount("https://", adapter) return session diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 1d306d3..6c18395 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -173,64 +173,69 @@ class PdfExtractResult: Outputs a JSON string as would be published to Kafka text/info topic. """ return { - 'key': self.sha1hex, - 'sha1hex': self.sha1hex, - 'status': self.status, - 'file_meta': self.file_meta, - 'error_msg': self.error_msg, - 'text': self.text, - 'has_page0_thumbnail': self.has_page0_thumbnail, - 'meta_xml': self.meta_xml, - 'pdf_info': self.pdf_info, - 'pdf_extra': self.pdf_extra, - 'source': self.source, + "key": self.sha1hex, + "sha1hex": self.sha1hex, + "status": self.status, + "file_meta": self.file_meta, + "error_msg": self.error_msg, + "text": self.text, + "has_page0_thumbnail": self.has_page0_thumbnail, + "meta_xml": self.meta_xml, + "pdf_info": self.pdf_info, + "pdf_extra": self.pdf_extra, + "source": self.source, } @staticmethod - def from_pdftext_dict(record: Dict[str, Any]) -> 'PdfExtractResult': + def from_pdftext_dict(record: Dict[str, Any]) -> "PdfExtractResult": """ Outputs a JSON string as would be published to Kafka text/info topic. 
""" - if record['status'] != 'success': + if record["status"] != "success": return PdfExtractResult( - sha1hex=record.get('sha1hex') or record['key'], - status=record['status'], - error_msg=record.get('error_msg'), + sha1hex=record.get("sha1hex") or record["key"], + status=record["status"], + error_msg=record.get("error_msg"), ) else: return PdfExtractResult( - sha1hex=record['sha1hex'], - status=record['status'], - file_meta=record.get('file_meta'), - text=record.get('text'), - has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)), - meta_xml=record.get('meta_xml'), - pdf_info=record.get('pdf_info'), - pdf_extra=record.get('pdf_extra'), + sha1hex=record["sha1hex"], + status=record["status"], + file_meta=record.get("file_meta"), + text=record.get("text"), + has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)), + meta_xml=record.get("meta_xml"), + pdf_info=record.get("pdf_info"), + pdf_extra=record.get("pdf_extra"), ) @staticmethod - def from_pdf_meta_dict(record: Dict[str, Any]) -> 'PdfExtractResult': + def from_pdf_meta_dict(record: Dict[str, Any]) -> "PdfExtractResult": """ Parses what would be returned from postgrest """ - if record['status'] != 'success': + if record["status"] != "success": return PdfExtractResult( - sha1hex=record['sha1hex'], - status=record['status'], - error_msg=(record.get('metadata') or {}).get('error_msg'), + sha1hex=record["sha1hex"], + status=record["status"], + error_msg=(record.get("metadata") or {}).get("error_msg"), ) else: pdf_extra = dict() - for k in ('page_count', 'page0_height', 'page0_width', 'permanent_id', - 'pdf_version'): + for k in ( + "page_count", + "page0_height", + "page0_width", + "permanent_id", + "pdf_version", + ): if record.get(k): pdf_extra[k] = record[k] return PdfExtractResult( - sha1hex=record['sha1hex'], - status=record['status'], - has_page0_thumbnail=bool(record.get('has_page0_thumbnail', False)), - pdf_info=record.get('metadata'), + sha1hex=record["sha1hex"], + status=record["status"], + has_page0_thumbnail=bool(record.get("has_page0_thumbnail", False)), + pdf_info=record.get("metadata"), pdf_extra=pdf_extra, ) @@ -247,11 +252,11 @@ class PdfExtractResult: # TODO: form, encrypted if self.pdf_info: metadata = dict() - for k in ('Title', 'Subject', 'Author', 'Creator', 'Producer', 'doi'): + for k in ("Title", "Subject", "Author", "Creator", "Producer", "doi"): if k in self.pdf_info: metadata[k.lower()] = self.pdf_info[k] - if 'CreationDate' in self.pdf_info: - pdf_created = self.pdf_info['CreationDate'] + if "CreationDate" in self.pdf_info: + pdf_created = self.pdf_info["CreationDate"] metadata_json: Optional[str] = None if metadata: metadata_json = json.dumps(metadata, sort_keys=True) @@ -260,20 +265,20 @@ class PdfExtractResult: datetime.datetime.now(), # updated self.status, self.has_page0_thumbnail, - pdf_extra.get('page_count'), + pdf_extra.get("page_count"), word_count, - pdf_extra.get('page0_height'), - pdf_extra.get('page0_width'), - pdf_extra.get('permanent_id'), + pdf_extra.get("page0_height"), + pdf_extra.get("page0_width"), + pdf_extra.get("permanent_id"), pdf_created, - pdf_extra.get('pdf_version'), + pdf_extra.get("pdf_version"), metadata_json, ) -def process_pdf(blob: bytes, - thumb_size: Tuple[int, int] = (180, 300), - thumb_type: str = "JPEG") -> PdfExtractResult: +def process_pdf( + blob: bytes, thumb_size: Tuple[int, int] = (180, 300), thumb_type: str = "JPEG" +) -> PdfExtractResult: """ A known issue is that output text is in "physical layout" mode, which means columns will be 
side-by-side. We would prefer a single stream of tokens! @@ -283,11 +288,11 @@ def process_pdf(blob: bytes, didn't seem to work at all (returned empty strings). """ file_meta = gen_file_metadata(blob) - sha1hex = file_meta['sha1hex'] - if file_meta['mimetype'] != 'application/pdf': + sha1hex = file_meta["sha1hex"] + if file_meta["mimetype"] != "application/pdf": return PdfExtractResult( sha1hex=sha1hex, - status='not-pdf', + status="not-pdf", error_msg=f"mimetype is '{file_meta['mimetype']}'", file_meta=file_meta, ) @@ -295,7 +300,7 @@ def process_pdf(blob: bytes, if sha1hex in BAD_PDF_SHA1HEX: return PdfExtractResult( sha1hex=sha1hex, - status='bad-pdf', + status="bad-pdf", error_msg="PDF known to cause processing issues", file_meta=file_meta, ) @@ -306,7 +311,7 @@ def process_pdf(blob: bytes, if pdf is None: return PdfExtractResult( sha1hex=sha1hex, - status='empty-pdf', + status="empty-pdf", file_meta=file_meta, has_page0_thumbnail=False, ) @@ -314,7 +319,7 @@ def process_pdf(blob: bytes, if page0 is None: return PdfExtractResult( sha1hex=sha1hex, - status='empty-page0', + status="empty-page0", file_meta=file_meta, ) # this call sometimes fails an returns an AttributeError @@ -324,7 +329,7 @@ def process_pdf(blob: bytes, # starting with a narrow set return PdfExtractResult( sha1hex=sha1hex, - status='parse-error', + status="parse-error", error_msg=str(e), file_meta=file_meta, ) @@ -334,8 +339,9 @@ def process_pdf(blob: bytes, renderer = poppler.PageRenderer() try: full_img = renderer.render_page(page0) - img = Image.frombuffer("RGBA", (full_img.width, full_img.height), full_img.data, 'raw', - "BGRA", 0, 1) + img = Image.frombuffer( + "RGBA", (full_img.width, full_img.height), full_img.data, "raw", "BGRA", 0, 1 + ) img.thumbnail(thumb_size, Image.BICUBIC) buf = BytesIO() img.save(buf, thumb_type) @@ -355,7 +361,7 @@ def process_pdf(blob: bytes, except AttributeError as e: return PdfExtractResult( sha1hex=sha1hex, - status='parse-error', + status="parse-error", error_msg=str(e), file_meta=file_meta, ) @@ -364,14 +370,14 @@ def process_pdf(blob: bytes, if len(full_text) > 1000000: return PdfExtractResult( sha1hex=sha1hex, - status='text-too-large', + status="text-too-large", error_msg="full_text chars: {}".format(len(full_text)), file_meta=file_meta, ) if len(pdf.metadata) > 1000000: return PdfExtractResult( sha1hex=sha1hex, - status='text-too-large', + status="text-too-large", error_msg="meta_xml chars: {}".format(len(full_text)), file_meta=file_meta, ) @@ -381,7 +387,7 @@ def process_pdf(blob: bytes, except UnicodeDecodeError: return PdfExtractResult( sha1hex=sha1hex, - status='bad-unicode', + status="bad-unicode", error_msg="in infos()", file_meta=file_meta, ) @@ -402,7 +408,7 @@ def process_pdf(blob: bytes, return PdfExtractResult( sha1hex=sha1hex, file_meta=file_meta, - status='success', + status="success", error_msg=None, text=full_text or None, has_page0_thumbnail=page0_thumbnail is not None, @@ -421,17 +427,19 @@ def process_pdf(blob: bytes, class PdfExtractWorker(SandcrawlerFetchWorker): - def __init__(self, - wayback_client: Optional[WaybackClient] = None, - sink: Optional[SandcrawlerWorker] = None, - **kwargs): + def __init__( + self, + wayback_client: Optional[WaybackClient] = None, + sink: Optional[SandcrawlerWorker] = None, + **kwargs, + ): super().__init__(wayback_client=wayback_client) self.wayback_client = wayback_client self.sink = sink - self.thumbnail_sink = kwargs.get('thumbnail_sink') + self.thumbnail_sink = kwargs.get("thumbnail_sink") def timeout_response(self, 
task: Dict[str, Any]) -> Dict[str, Any]: - default_key = task['sha1hex'] + default_key = task["sha1hex"] return dict( status="error-timeout", error_msg="internal pdf-extract worker timeout", @@ -441,9 +449,9 @@ class PdfExtractWorker(SandcrawlerFetchWorker): def process(self, record: Any, key: Optional[str] = None) -> dict: fetch_result = self.fetch_blob(record) - if fetch_result['status'] != 'success': + if fetch_result["status"] != "success": return fetch_result - blob: bytes = fetch_result['blob'] + blob: bytes = fetch_result["blob"] assert blob and isinstance(blob, bytes) result = process_pdf(blob) @@ -458,10 +466,11 @@ class PdfExtractBlobWorker(SandcrawlerWorker): This is sort of like PdfExtractWorker, except it receives blobs directly, instead of fetching blobs from some remote store. """ + def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs): super().__init__() self.sink = sink - self.thumbnail_sink = kwargs.get('thumbnail_sink') + self.thumbnail_sink = kwargs.get("thumbnail_sink") def process(self, blob: Any, key: Optional[str] = None) -> Any: if not blob: diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 138e65c..d765164 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -32,37 +32,37 @@ class PdfTrioClient(object): pdftrio_response = requests.post( self.host_url + "/classify/research-pub/" + mode, files={ - 'pdf_content': blob, + "pdf_content": blob, }, timeout=60.0, ) except requests.Timeout: return { - 'status': 'error-timeout', - 'status_code': -4, # heritrix3 "HTTP timeout" code - 'error_msg': 'pdftrio request (HTTP POST) timeout', + "status": "error-timeout", + "status_code": -4, # heritrix3 "HTTP timeout" code + "error_msg": "pdftrio request (HTTP POST) timeout", } except requests.exceptions.ConnectionError: # crude back-off time.sleep(2.0) return { - 'status': 'error-connect', - 'status_code': -2, # heritrix3 "HTTP connect" code - 'error_msg': 'pdftrio request connection timout', + "status": "error-connect", + "status_code": -2, # heritrix3 "HTTP connect" code + "error_msg": "pdftrio request connection timout", } info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code) if pdftrio_response.status_code == 200: resp_json = pdftrio_response.json() - assert 'ensemble_score' in resp_json - assert 'status' in resp_json - assert 'versions' in resp_json + assert "ensemble_score" in resp_json + assert "status" in resp_json + assert "versions" in resp_json info.update(resp_json) else: - info['status'] = 'error' + info["status"] = "error" # TODO: might return JSON with some info? 
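# A minimal sketch of the page-thumbnail step inside process_pdf(): render the
# first page with python-poppler, wrap the raw BGRA buffer in a Pillow image,
# and shrink it to a bounded JPEG. The page0 argument is assumed to be a
# poppler page object already loaded by the caller; the size/format defaults
# mirror the reformatted signature above, and the RGB conversion is added here
# only so Pillow can encode JPEG output.
from io import BytesIO
from typing import Optional, Tuple

import poppler
from PIL import Image


def render_page0_thumbnail(
    page0, thumb_size: Tuple[int, int] = (180, 300), thumb_type: str = "JPEG"
) -> Optional[bytes]:
    renderer = poppler.PageRenderer()
    try:
        full_img = renderer.render_page(page0)
        img = Image.frombuffer(
            "RGBA", (full_img.width, full_img.height), full_img.data, "raw", "BGRA", 0, 1
        )
        img.thumbnail(thumb_size, Image.BICUBIC)
        buf = BytesIO()
        # JPEG cannot carry an alpha channel, so drop it before encoding
        img.convert("RGB").save(buf, thumb_type)
        return buf.getvalue()
    except Exception:
        # thumbnail generation is best-effort; extraction continues without it
        return None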
- info['_total_sec'] = pdftrio_response.elapsed.total_seconds() + info["_total_sec"] = pdftrio_response.elapsed.total_seconds() return info @@ -70,11 +70,14 @@ class PdfTrioWorker(SandcrawlerFetchWorker): """ This class is basically copied directly from GrobidWorker """ - def __init__(self, - pdftrio_client: PdfTrioClient, - wayback_client: Optional[WaybackClient] = None, - sink: Optional[SandcrawlerWorker] = None, - **kwargs): + + def __init__( + self, + pdftrio_client: PdfTrioClient, + wayback_client: Optional[WaybackClient] = None, + sink: Optional[SandcrawlerWorker] = None, + **kwargs + ): super().__init__(wayback_client=wayback_client, **kwargs) self.pdftrio_client = pdftrio_client self.sink = sink @@ -86,22 +89,22 @@ class PdfTrioWorker(SandcrawlerFetchWorker): start = time.time() fetch_result = self.fetch_blob(record) fetch_sec = time.time() - start - if fetch_result['status'] != 'success': + if fetch_result["status"] != "success": return fetch_result - blob: bytes = fetch_result['blob'] + blob: bytes = fetch_result["blob"] assert blob and isinstance(blob, bytes) result = dict() - result['file_meta'] = gen_file_metadata(blob) - result['key'] = result['file_meta']['sha1hex'] - result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob) - result['source'] = record - result['timing'] = dict( - pdftrio_sec=result['pdf_trio'].pop('_total_sec', None), + result["file_meta"] = gen_file_metadata(blob) + result["key"] = result["file_meta"]["sha1hex"] + result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob) + result["source"] = record + result["timing"] = dict( + pdftrio_sec=result["pdf_trio"].pop("_total_sec", None), total_sec=time.time() - start_process, ) if fetch_sec: - result['timing']['fetch_sec'] = fetch_sec + result["timing"]["fetch_sec"] = fetch_sec return result @@ -110,11 +113,14 @@ class PdfTrioBlobWorker(SandcrawlerWorker): This is sort of like PdfTrioWorker, except it receives blobs directly, instead of fetching blobs from some remote store. 
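# A minimal sketch of the timing bookkeeping used by PdfTrioWorker.process():
# fetch and classification durations are measured separately and attached to
# the result under "timing". The fetch_blob/classify_pdf callables stand in
# for the codebase's own helpers and are assumptions here.
import time
from typing import Any, Dict


def classify_with_timing(fetch_blob, classify_pdf, record: Any) -> Dict[str, Any]:
    start_process = time.time()
    start = time.time()
    fetch_result = fetch_blob(record)
    fetch_sec = time.time() - start
    if fetch_result["status"] != "success":
        return fetch_result
    result: Dict[str, Any] = {"pdf_trio": classify_pdf(fetch_result["blob"])}
    result["timing"] = {
        # classify_pdf() reports its own wall-clock time as "_total_sec"
        "pdftrio_sec": result["pdf_trio"].pop("_total_sec", None),
        "total_sec": time.time() - start_process,
        "fetch_sec": fetch_sec,
    }
    return result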
""" - def __init__(self, - pdftrio_client: PdfTrioClient, - sink: Optional[SandcrawlerWorker] = None, - mode: str = "auto", - **kwargs): + + def __init__( + self, + pdftrio_client: PdfTrioClient, + sink: Optional[SandcrawlerWorker] = None, + mode: str = "auto", + **kwargs + ): super().__init__(**kwargs) self.pdftrio_client = pdftrio_client self.sink = sink @@ -126,11 +132,11 @@ class PdfTrioBlobWorker(SandcrawlerWorker): return None assert isinstance(blob, bytes) result = dict() - result['file_meta'] = gen_file_metadata(blob) - result['key'] = result['file_meta']['sha1hex'] - result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode) - result['timing'] = dict( - pdftrio_sec=result['pdf_trio'].pop('_total_sec', None), + result["file_meta"] = gen_file_metadata(blob) + result["key"] = result["file_meta"]["sha1hex"] + result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode) + result["timing"] = dict( + pdftrio_sec=result["pdf_trio"].pop("_total_sec", None), total_sec=time.time() - start_process, ) return result diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 8ec5979..f7954b1 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -41,14 +41,14 @@ class PersistCdxWorker(SandcrawlerWorker): raise NotImplementedError def push_batch(self, batch: list) -> list: - self.counts['total'] += len(batch) + self.counts["total"] += len(batch) # filter to full CDX lines, no liveweb - cdx_batch = [r for r in batch if r.get('warc_path') and ("/" in r['warc_path'])] + cdx_batch = [r for r in batch if r.get("warc_path") and ("/" in r["warc_path"])] resp = self.db.insert_cdx(self.cur, cdx_batch) if len(cdx_batch) < len(batch): - self.counts['skip'] += len(batch) - len(cdx_batch) - self.counts['insert-cdx'] += resp[0] - self.counts['update-cdx'] += resp[1] + self.counts["skip"] += len(batch) - len(cdx_batch) + self.counts["insert-cdx"] += resp[0] + self.counts["update-cdx"] += resp[1] self.db.commit() return [] @@ -70,47 +70,52 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if there is a problem with conversion, return None """ # backwards compat hacks; transform request to look like current schema - if raw.get('ingest_type') == 'file': - raw['ingest_type'] = 'pdf' - if (not raw.get('link_source') and raw.get('base_url') - and raw.get('ext_ids', {}).get('doi') - and raw['base_url'] == "https://doi.org/{}".format(raw['ext_ids']['doi'])): + if raw.get("ingest_type") == "file": + raw["ingest_type"] = "pdf" + if ( + not raw.get("link_source") + and raw.get("base_url") + and raw.get("ext_ids", {}).get("doi") + and raw["base_url"] == "https://doi.org/{}".format(raw["ext_ids"]["doi"]) + ): # set link_source(_id) for old ingest requests - raw['link_source'] = 'doi' - raw['link_source_id'] = raw['ext_ids']['doi'] - if (not raw.get('link_source') - and raw.get('ingest_request_source', '').startswith('savepapernow') - and raw.get('fatcat', {}).get('release_ident')): + raw["link_source"] = "doi" + raw["link_source_id"] = raw["ext_ids"]["doi"] + if ( + not raw.get("link_source") + and raw.get("ingest_request_source", "").startswith("savepapernow") + and raw.get("fatcat", {}).get("release_ident") + ): # set link_source(_id) for old ingest requests - raw['link_source'] = 'spn' - raw['link_source_id'] = raw['fatcat']['release_ident'] + raw["link_source"] = "spn" + raw["link_source_id"] = raw["fatcat"]["release_ident"] - for k in ('ingest_type', 'base_url', 'link_source', 'link_source_id'): + for k in ("ingest_type", 
"base_url", "link_source", "link_source_id"): if k not in raw: - self.counts['skip-request-fields'] += 1 + self.counts["skip-request-fields"] += 1 return None - if raw['ingest_type'] not in ('pdf', 'xml', 'html'): - self.counts['skip-ingest-type'] += 1 + if raw["ingest_type"] not in ("pdf", "xml", "html"): + self.counts["skip-ingest-type"] += 1 return None request = { - 'ingest_type': raw['ingest_type'], - 'base_url': raw['base_url'], - 'link_source': raw['link_source'], - 'link_source_id': raw['link_source_id'], - 'ingest_request_source': raw.get('ingest_request_source'), - 'request': {}, + "ingest_type": raw["ingest_type"], + "base_url": raw["base_url"], + "link_source": raw["link_source"], + "link_source_id": raw["link_source_id"], + "ingest_request_source": raw.get("ingest_request_source"), + "request": {}, } # extra/optional fields - if raw.get('release_stage'): - request['release_stage'] = raw['release_stage'] - if raw.get('fatcat', {}).get('release_ident'): - request['request']['release_ident'] = raw['fatcat']['release_ident'] - for k in ('ext_ids', 'edit_extra', 'rel'): + if raw.get("release_stage"): + request["release_stage"] = raw["release_stage"] + if raw.get("fatcat", {}).get("release_ident"): + request["request"]["release_ident"] = raw["fatcat"]["release_ident"] + for k in ("ext_ids", "edit_extra", "rel"): if raw.get(k): - request['request'][k] = raw[k] + request["request"][k] = raw[k] # if this dict is empty, trim it to save DB space - if not request['request']: - request['request'] = None + if not request["request"]: + request["request"] = None return request def file_result_to_row(self, raw: dict) -> Optional[dict]: @@ -119,59 +124,68 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if there is a problem with conversion, return None and set skip count """ - for k in ('request', 'hit', 'status'): + for k in ("request", "hit", "status"): if k not in raw: - self.counts['skip-result-fields'] += 1 + self.counts["skip-result-fields"] += 1 return None - if 'base_url' not in raw['request']: - self.counts['skip-result-fields'] += 1 + if "base_url" not in raw["request"]: + self.counts["skip-result-fields"] += 1 return None - ingest_type = raw['request'].get('ingest_type') - if ingest_type == 'file': - ingest_type = 'pdf' - if ingest_type not in ('pdf', 'xml', 'html', 'component', 'src', 'dataset', - 'dataset-file'): - self.counts['skip-ingest-type'] += 1 + ingest_type = raw["request"].get("ingest_type") + if ingest_type == "file": + ingest_type = "pdf" + if ingest_type not in ( + "pdf", + "xml", + "html", + "component", + "src", + "dataset", + "dataset-file", + ): + self.counts["skip-ingest-type"] += 1 return None - if raw['status'] in ("existing", ): - self.counts['skip-existing'] += 1 + if raw["status"] in ("existing",): + self.counts["skip-existing"] += 1 return None result = { - 'ingest_type': ingest_type, - 'base_url': raw['request']['base_url'], - 'hit': raw['hit'], - 'status': raw['status'], + "ingest_type": ingest_type, + "base_url": raw["request"]["base_url"], + "hit": raw["hit"], + "status": raw["status"], } - terminal = raw.get('terminal') + terminal = raw.get("terminal") if terminal: - result['terminal_url'] = terminal.get('terminal_url') or terminal.get('url') - result['terminal_dt'] = terminal.get('terminal_dt') - result['terminal_status_code'] = terminal.get( - 'terminal_status_code') or terminal.get('status_code') or terminal.get( - 'http_code') - if result['terminal_status_code']: - result['terminal_status_code'] = int(result['terminal_status_code']) - 
result['terminal_sha1hex'] = terminal.get('terminal_sha1hex') - if len(result['terminal_url']) > 2048: + result["terminal_url"] = terminal.get("terminal_url") or terminal.get("url") + result["terminal_dt"] = terminal.get("terminal_dt") + result["terminal_status_code"] = ( + terminal.get("terminal_status_code") + or terminal.get("status_code") + or terminal.get("http_code") + ) + if result["terminal_status_code"]: + result["terminal_status_code"] = int(result["terminal_status_code"]) + result["terminal_sha1hex"] = terminal.get("terminal_sha1hex") + if len(result["terminal_url"]) > 2048: # postgresql13 doesn't like extremely large URLs in b-tree index - self.counts['skip-huge-url'] += 1 + self.counts["skip-huge-url"] += 1 return None return result def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]: - html_body = record.get('html_body') - file_meta = record.get('file_meta') + html_body = record.get("html_body") + file_meta = record.get("file_meta") if not (file_meta and html_body): return None return HtmlMetaRow( sha1hex=file_meta["sha1hex"], - status=record.get('status'), - scope=record.get('scope'), - has_teixml=bool(html_body and html_body['status'] == 'success'), + status=record.get("status"), + scope=record.get("scope"), + has_teixml=bool(html_body and html_body["status"] == "success"), has_thumbnail=False, # TODO - word_count=(html_body and html_body.get('word_count')) or None, - biblio=record.get('html_biblio'), - resources=record.get('html_resources'), + word_count=(html_body and html_body.get("word_count")) or None, + biblio=record.get("html_biblio"), + resources=record.get("html_resources"), ) def result_to_platform_row(self, raw: dict) -> Optional[dict]: @@ -180,46 +194,49 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if there is a problem with conversion, return None and set skip count """ - for k in ('request', 'hit', 'status'): + for k in ("request", "hit", "status"): if k not in raw: return None - if 'base_url' not in raw['request']: + if "base_url" not in raw["request"]: return None - ingest_type = raw['request'].get('ingest_type') - if ingest_type not in ('dataset'): + ingest_type = raw["request"].get("ingest_type") + if ingest_type not in ("dataset"): return None - if raw['status'] in ("existing", ): + if raw["status"] in ("existing",): return None - if not raw.get('platform_name'): + if not raw.get("platform_name"): return None result = { - 'ingest_type': ingest_type, - 'base_url': raw['request']['base_url'], - 'hit': raw['hit'], - 'status': raw['status'], - 'platform_name': raw.get('platform_name'), - 'platform_domain': raw.get('platform_domain'), - 'platform_id': raw.get('platform_id'), - 'ingest_strategy': raw.get('ingest_strategy'), - 'total_size': raw.get('total_size'), - 'file_count': raw.get('file_count'), - 'archiveorg_item_name': raw.get('archiveorg_item_name'), - 'archiveorg_item_bundle_path': None, - 'web_bundle_url': None, - 'web_bundle_dt': None, - 'manifest': raw.get('manifest'), + "ingest_type": ingest_type, + "base_url": raw["request"]["base_url"], + "hit": raw["hit"], + "status": raw["status"], + "platform_name": raw.get("platform_name"), + "platform_domain": raw.get("platform_domain"), + "platform_id": raw.get("platform_id"), + "ingest_strategy": raw.get("ingest_strategy"), + "total_size": raw.get("total_size"), + "file_count": raw.get("file_count"), + "archiveorg_item_name": raw.get("archiveorg_item_name"), + "archiveorg_item_bundle_path": None, + "web_bundle_url": None, + "web_bundle_dt": None, + "manifest": 
raw.get("manifest"), } - if result.get('fileset_bundle'): - result['archiveorg_item_bundle_path'] = result['fileset_bundle'].get( - 'archiveorg_item_bundle_path') - result['web_bundle_url'] = result['fileset_bundle'].get('terminal', - {}).get('terminal_url') - result['web_bundle_dt'] = result['fileset_bundle'].get('terminal', - {}).get('terminal_dt') + if result.get("fileset_bundle"): + result["archiveorg_item_bundle_path"] = result["fileset_bundle"].get( + "archiveorg_item_bundle_path" + ) + result["web_bundle_url"] = ( + result["fileset_bundle"].get("terminal", {}).get("terminal_url") + ) + result["web_bundle_dt"] = ( + result["fileset_bundle"].get("terminal", {}).get("terminal_dt") + ) return result def push_batch(self, batch: List[Any]) -> List[Any]: - self.counts['total'] += len(batch) + self.counts["total"] += len(batch) if not batch: return [] @@ -228,60 +245,62 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): results = [r for r in results_unfiltered if r] irequests_unfiltered = [ - self.request_to_row(raw['request']) for raw in batch if raw.get('request') + self.request_to_row(raw["request"]) for raw in batch if raw.get("request") ] irequests = [ - r for r in irequests_unfiltered if r and r['ingest_type'] != 'dataset-file' + r for r in irequests_unfiltered if r and r["ingest_type"] != "dataset-file" ] if irequests: resp = self.db.insert_ingest_request(self.cur, irequests) - self.counts['insert-requests'] += resp[0] - self.counts['update-requests'] += resp[1] + self.counts["insert-requests"] += resp[0] + self.counts["update-requests"] += resp[1] if results: resp = self.db.insert_ingest_file_result(self.cur, results, on_conflict="update") - self.counts['insert-results'] += resp[0] - self.counts['update-results'] += resp[1] + self.counts["insert-results"] += resp[0] + self.counts["update-results"] += resp[1] # these schemas match, so can just pass through - cdx_batch = [r['cdx'] for r in batch if r.get('hit') and r.get('cdx')] + cdx_batch = [r["cdx"] for r in batch if r.get("hit") and r.get("cdx")] revisit_cdx_batch = [ - r['revisit_cdx'] for r in batch if r.get('hit') and r.get('revisit_cdx') + r["revisit_cdx"] for r in batch if r.get("hit") and r.get("revisit_cdx") ] cdx_batch.extend(revisit_cdx_batch) # filter to full CDX lines, with full warc_paths (not liveweb) - cdx_batch = [r for r in cdx_batch if r.get('warc_path') and ("/" in r['warc_path'])] + cdx_batch = [r for r in cdx_batch if r.get("warc_path") and ("/" in r["warc_path"])] if cdx_batch: resp = self.db.insert_cdx(self.cur, cdx_batch) - self.counts['insert-cdx'] += resp[0] - self.counts['update-cdx'] += resp[1] + self.counts["insert-cdx"] += resp[0] + self.counts["update-cdx"] += resp[1] - file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')] + file_meta_batch = [r["file_meta"] for r in batch if r.get("hit") and r.get("file_meta")] if file_meta_batch: resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing") - self.counts['insert-file_meta'] += resp[0] - self.counts['update-file_meta'] += resp[1] + self.counts["insert-file_meta"] += resp[0] + self.counts["update-file_meta"] += resp[1] html_meta_batch = [ - self.result_to_html_meta(r) for r in batch if r.get('hit') and r.get('html_body') + self.result_to_html_meta(r) for r in batch if r.get("hit") and r.get("html_body") ] if html_meta_batch: rows = [d.to_sql_tuple() for d in html_meta_batch if d] resp = self.db.insert_html_meta(self.cur, rows, on_conflict="update") - self.counts['insert-html_meta'] += 
resp[0] - self.counts['update-html_meta'] += resp[1] + self.counts["insert-html_meta"] += resp[0] + self.counts["update-html_meta"] += resp[1] fileset_platform_batch_all = [ - self.result_to_platform_row(raw) for raw in batch if - raw.get('request', {}).get('ingest_type') == 'dataset' and raw.get('platform_name') + self.result_to_platform_row(raw) + for raw in batch + if raw.get("request", {}).get("ingest_type") == "dataset" + and raw.get("platform_name") ] fileset_platform_batch: List[Dict] = [p for p in fileset_platform_batch_all if p] if fileset_platform_batch: - resp = self.db.insert_ingest_fileset_platform(self.cur, - fileset_platform_batch, - on_conflict="update") - self.counts['insert-fileset_platform'] += resp[0] - self.counts['update-fileset_platform'] += resp[1] + resp = self.db.insert_ingest_fileset_platform( + self.cur, fileset_platform_batch, on_conflict="update" + ) + self.counts["insert-fileset_platform"] += resp[0] + self.counts["update-fileset_platform"] += resp[1] self.db.commit() return [] @@ -307,7 +326,7 @@ class PersistIngestRequestWorker(PersistIngestFileResultWorker): raise NotImplementedError def push_batch(self, batch: list) -> list: - self.counts['total'] += len(batch) + self.counts["total"] += len(batch) if not batch: return [] @@ -317,8 +336,8 @@ class PersistIngestRequestWorker(PersistIngestFileResultWorker): if irequests: resp = self.db.insert_ingest_request(self.cur, irequests) - self.counts['insert-requests'] += resp[0] - self.counts['update-requests'] += resp[1] + self.counts["insert-requests"] += resp[0] + self.counts["update-requests"] += resp[1] self.db.commit() return [] @@ -329,13 +348,13 @@ class PersistGrobidWorker(SandcrawlerWorker): super().__init__() self.grobid = GrobidClient() self.s3 = SandcrawlerMinioClient( - host_url=kwargs.get('s3_url', 'localhost:9000'), - access_key=kwargs['s3_access_key'], - secret_key=kwargs['s3_secret_key'], - default_bucket=kwargs['s3_bucket'], + host_url=kwargs.get("s3_url", "localhost:9000"), + access_key=kwargs["s3_access_key"], + secret_key=kwargs["s3_secret_key"], + default_bucket=kwargs["s3_bucket"], ) - self.s3_only = kwargs.get('s3_only', False) - self.db_only = kwargs.get('db_only', False) + self.s3_only = kwargs.get("s3_only", False) + self.db_only = kwargs.get("db_only", False) assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" if not self.s3_only: self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url) @@ -349,58 +368,58 @@ class PersistGrobidWorker(SandcrawlerWorker): raise NotImplementedError def push_batch(self, batch: list) -> list: - self.counts['total'] += len(batch) + self.counts["total"] += len(batch) # filter out bad "missing status_code" timeout rows - missing = [r for r in batch if not r.get('status_code')] + missing = [r for r in batch if not r.get("status_code")] if missing: - self.counts['skip-missing-status'] += len(missing) - batch = [r for r in batch if r.get('status_code')] + self.counts["skip-missing-status"] += len(missing) + batch = [r for r in batch if r.get("status_code")] for r in batch: - if r['status_code'] != 200 or not r.get('tei_xml'): - self.counts['s3-skip-status'] += 1 - if r.get('error_msg'): - r['metadata'] = {'error_msg': r['error_msg'][:500]} + if r["status_code"] != 200 or not r.get("tei_xml"): + self.counts["s3-skip-status"] += 1 + if r.get("error_msg"): + r["metadata"] = {"error_msg": r["error_msg"][:500]} continue - assert len(r['key']) == 40 + assert len(r["key"]) == 40 if not self.db_only: 
self.s3.put_blob( folder="grobid", - blob=r['tei_xml'], - sha1hex=r['key'], + blob=r["tei_xml"], + sha1hex=r["key"], extension=".tei.xml", ) - self.counts['s3-put'] += 1 + self.counts["s3-put"] += 1 # enhance with teixml2json metadata, if available try: metadata = self.grobid.metadata(r) except xml.etree.ElementTree.ParseError as xml_e: - r['status'] = 'bad-grobid-xml' - r['metadata'] = {'error_msg': str(xml_e)[:1024]} + r["status"] = "bad-grobid-xml" + r["metadata"] = {"error_msg": str(xml_e)[:1024]} continue if not metadata: continue - for k in ('fatcat_release', 'grobid_version'): + for k in ("fatcat_release", "grobid_version"): r[k] = metadata.pop(k, None) - if r.get('fatcat_release'): - r['fatcat_release'] = r['fatcat_release'].replace('release_', '') - if metadata.get('grobid_timestamp'): - r['updated'] = metadata['grobid_timestamp'] - r['metadata'] = metadata + if r.get("fatcat_release"): + r["fatcat_release"] = r["fatcat_release"].replace("release_", "") + if metadata.get("grobid_timestamp"): + r["updated"] = metadata["grobid_timestamp"] + r["metadata"] = metadata if not self.s3_only: assert self.db and self.cur resp = self.db.insert_grobid(self.cur, batch, on_conflict="update") - self.counts['insert-grobid'] += resp[0] - self.counts['update-grobid'] += resp[1] + self.counts["insert-grobid"] += resp[0] + self.counts["update-grobid"] += resp[1] - file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')] + file_meta_batch = [r["file_meta"] for r in batch if r.get("file_meta")] resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update") - self.counts['insert-file-meta'] += resp[0] - self.counts['update-file-meta'] += resp[1] + self.counts["insert-file-meta"] += resp[0] + self.counts["update-file-meta"] += resp[1] self.db.commit() @@ -413,6 +432,7 @@ class PersistGrobidDiskWorker(SandcrawlerWorker): This could be refactored into a "Sink" type with an even thinner wrapper. 
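# PersistGrobidDiskWorker writes TEI-XML to disk via self._blob_path(key),
# whose body is outside this diff. A plausible implementation shards files by
# the leading bytes of the sha1hex so no single directory grows too large; the
# two-level prefix scheme below is an assumption, not the project's actual
# layout.
import os


def blob_path(output_dir: str, sha1hex: str, extension: str = ".tei.xml") -> str:
    assert len(sha1hex) == 40
    # hypothetical sharding: <output_dir>/ab/cd/abcd...<ext>
    return os.path.join(output_dir, sha1hex[0:2], sha1hex[2:4], sha1hex + extension)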
""" + def __init__(self, output_dir: str): super().__init__() self.output_dir = output_dir @@ -428,14 +448,14 @@ class PersistGrobidDiskWorker(SandcrawlerWorker): def process(self, record: Any, key: Optional[str] = None) -> Any: - if record.get('status_code') != 200 or not record.get('tei_xml'): + if record.get("status_code") != 200 or not record.get("tei_xml"): return False - assert (len(record['key'])) == 40 - p = "{}/{}".format(self.output_dir, self._blob_path(record['key'])) + assert (len(record["key"])) == 40 + p = "{}/{}".format(self.output_dir, self._blob_path(record["key"])) os.makedirs(os.path.dirname(p), exist_ok=True) - with open(p, 'w') as f: - f.write(record.pop('tei_xml')) - self.counts['written'] += 1 + with open(p, "w") as f: + f.write(record.pop("tei_xml")) + self.counts["written"] += 1 return record @@ -450,24 +470,25 @@ class PersistPdfTrioWorker(SandcrawlerWorker): raise NotImplementedError def push_batch(self, batch: list) -> list: - self.counts['total'] += len(batch) + self.counts["total"] += len(batch) - batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')] + batch = [r for r in batch if "pdf_trio" in r and r["pdf_trio"].get("status_code")] for r in batch: # copy key (sha1hex) into sub-object - r['pdf_trio']['key'] = r['key'] - pdftrio_batch = [r['pdf_trio'] for r in batch] + r["pdf_trio"]["key"] = r["key"] + pdftrio_batch = [r["pdf_trio"] for r in batch] resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update") - self.counts['insert-pdftrio'] += resp[0] - self.counts['update-pdftrio'] += resp[1] + self.counts["insert-pdftrio"] += resp[0] + self.counts["update-pdftrio"] += resp[1] file_meta_batch = [ - r['file_meta'] for r in batch - if r['pdf_trio']['status'] == "success" and r.get('file_meta') + r["file_meta"] + for r in batch + if r["pdf_trio"]["status"] == "success" and r.get("file_meta") ] resp = self.db.insert_file_meta(self.cur, file_meta_batch) - self.counts['insert-file-meta'] += resp[0] - self.counts['update-file-meta'] += resp[1] + self.counts["insert-file-meta"] += resp[0] + self.counts["update-file-meta"] += resp[1] self.db.commit() return [] @@ -479,16 +500,17 @@ class PersistPdfTextWorker(SandcrawlerWorker): Should keep batch sizes small. 
""" + def __init__(self, db_url: str, **kwargs): super().__init__() self.s3 = SandcrawlerMinioClient( - host_url=kwargs.get('s3_url', 'localhost:9000'), - access_key=kwargs['s3_access_key'], - secret_key=kwargs['s3_secret_key'], - default_bucket=kwargs['s3_bucket'], + host_url=kwargs.get("s3_url", "localhost:9000"), + access_key=kwargs["s3_access_key"], + secret_key=kwargs["s3_secret_key"], + default_bucket=kwargs["s3_bucket"], ) - self.s3_only = kwargs.get('s3_only', False) - self.db_only = kwargs.get('db_only', False) + self.s3_only = kwargs.get("s3_only", False) + self.db_only = kwargs.get("db_only", False) assert not (self.s3_only and self.db_only), "Only one of s3_only and db_only allowed" if not self.s3_only: self.db: Optional[SandcrawlerPostgresClient] = SandcrawlerPostgresClient(db_url) @@ -502,17 +524,17 @@ class PersistPdfTextWorker(SandcrawlerWorker): raise NotImplementedError def push_batch(self, batch: list) -> list: - self.counts['total'] += len(batch) + self.counts["total"] += len(batch) parsed_batch = [] for r in batch: parsed_batch.append(PdfExtractResult.from_pdftext_dict(r)) for r in parsed_batch: - if r.status != 'success' or not r.text: - self.counts['s3-skip-status'] += 1 + if r.status != "success" or not r.text: + self.counts["s3-skip-status"] += 1 if r.error_msg: - r.metadata = {'error_msg': r.error_msg[:500]} + r.metadata = {"error_msg": r.error_msg[:500]} continue assert len(r.sha1hex) == 40 @@ -523,19 +545,19 @@ class PersistPdfTextWorker(SandcrawlerWorker): sha1hex=r.sha1hex, extension=".txt", ) - self.counts['s3-put'] += 1 + self.counts["s3-put"] += 1 if not self.s3_only: assert self.db and self.cur rows = [r.to_sql_tuple() for r in parsed_batch] resp = self.db.insert_pdf_meta(self.cur, rows, on_conflict="update") - self.counts['insert-pdf-meta'] += resp[0] - self.counts['update-pdf-meta'] += resp[1] + self.counts["insert-pdf-meta"] += resp[0] + self.counts["update-pdf-meta"] += resp[1] file_meta_batch = [r.file_meta for r in parsed_batch if r.file_meta] resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update") - self.counts['insert-file-meta'] += resp[0] - self.counts['update-file-meta'] += resp[1] + self.counts["insert-file-meta"] += resp[0] + self.counts["update-file-meta"] += resp[1] self.db.commit() @@ -550,16 +572,17 @@ class PersistThumbnailWorker(SandcrawlerWorker): This worker *must* be used with raw kakfa mode; thumbnails are *not* wrapped in JSON like most sandcrawler kafka messages. 
""" + def __init__(self, **kwargs): super().__init__() self.s3 = SandcrawlerMinioClient( - host_url=kwargs.get('s3_url', 'localhost:9000'), - access_key=kwargs['s3_access_key'], - secret_key=kwargs['s3_secret_key'], - default_bucket=kwargs['s3_bucket'], + host_url=kwargs.get("s3_url", "localhost:9000"), + access_key=kwargs["s3_access_key"], + secret_key=kwargs["s3_secret_key"], + default_bucket=kwargs["s3_bucket"], ) - self.s3_extension = kwargs.get('s3_extension', ".jpg") - self.s3_folder = kwargs.get('s3_folder', "pdf") + self.s3_extension = kwargs.get("s3_extension", ".jpg") + self.s3_folder = kwargs.get("s3_folder", "pdf") def process(self, record: Any, key: Optional[str] = None) -> Any: """ @@ -569,7 +592,7 @@ class PersistThumbnailWorker(SandcrawlerWorker): assert isinstance(record, bytes) blob: bytes = record if isinstance(key, bytes): - key = key.decode('utf-8') + key = key.decode("utf-8") assert key is not None and len(key) == 40 and isinstance(key, str) assert len(blob) >= 50 @@ -579,7 +602,7 @@ class PersistThumbnailWorker(SandcrawlerWorker): sha1hex=key, extension=self.s3_extension, ) - self.counts['s3-put'] += 1 + self.counts["s3-put"] += 1 class GenericPersistDocWorker(SandcrawlerWorker): @@ -588,39 +611,40 @@ class GenericPersistDocWorker(SandcrawlerWorker): Objects are assumed to be JSON-wrapped strings. """ + def __init__(self, **kwargs): super().__init__() self.s3 = SandcrawlerMinioClient( - host_url=kwargs.get('s3_url', 'localhost:9000'), - access_key=kwargs['s3_access_key'], - secret_key=kwargs['s3_secret_key'], - default_bucket=kwargs['s3_bucket'], + host_url=kwargs.get("s3_url", "localhost:9000"), + access_key=kwargs["s3_access_key"], + secret_key=kwargs["s3_secret_key"], + default_bucket=kwargs["s3_bucket"], ) - self.s3_extension = kwargs.get('s3_extension', ".unknown") - self.s3_folder = kwargs.get('s3_folder', "unknown") + self.s3_extension = kwargs.get("s3_extension", ".unknown") + self.s3_folder = kwargs.get("s3_folder", "unknown") self.doc_key = "unknown" def process(self, record: Any, key: Optional[str] = None) -> Any: - if record.get('status') != 'success' or not record.get(self.doc_key): + if record.get("status") != "success" or not record.get(self.doc_key): return assert key is not None if isinstance(key, bytes): - key_str = key.decode('utf-8') + key_str = key.decode("utf-8") elif isinstance(key, str): key_str = key assert len(key_str) == 40 - if 'sha1hex' in record: - assert key_str == record['sha1hex'] + if "sha1hex" in record: + assert key_str == record["sha1hex"] self.s3.put_blob( folder=self.s3_folder, - blob=record[self.doc_key].encode('utf-8'), + blob=record[self.doc_key].encode("utf-8"), sha1hex=key_str, extension=self.s3_extension, ) - self.counts['s3-put'] += 1 + self.counts["s3-put"] += 1 class PersistXmlDocWorker(GenericPersistDocWorker): @@ -628,10 +652,11 @@ class PersistXmlDocWorker(GenericPersistDocWorker): Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to sandcrawler database (SQL). """ + def __init__(self, **kwargs): super().__init__(**kwargs) - self.s3_extension = kwargs.get('s3_extension', ".jats.xml") - self.s3_folder = kwargs.get('s3_folder', "xml_doc") + self.s3_extension = kwargs.get("s3_extension", ".jats.xml") + self.s3_folder = kwargs.get("s3_folder", "xml_doc") self.doc_key = "jats_xml" @@ -640,8 +665,9 @@ class PersistHtmlTeiXmlWorker(GenericPersistDocWorker): Pushes TEI-XML file to blob store (S3/seaweed/minio). Does not talk to sandcrawler database (SQL). 
""" + def __init__(self, **kwargs): super().__init__(**kwargs) - self.s3_extension = kwargs.get('s3_extension', ".tei.xml") - self.s3_folder = kwargs.get('s3_folder', "html_body") + self.s3_extension = kwargs.get("s3_extension", ".tei.xml") + self.s3_folder = kwargs.get("s3_folder", "html_body") self.doc_key = "tei_xml" diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index ceb6671..bd7f36a 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -10,8 +10,13 @@ from typing import Any, Dict, List, Optional, Sequence import requests from confluent_kafka import Consumer, KafkaException, Producer -from .ia import (PetaboxError, SandcrawlerBackoffError, WaybackClient, WaybackContentError, - WaybackError) +from .ia import ( + PetaboxError, + SandcrawlerBackoffError, + WaybackClient, + WaybackContentError, + WaybackError, +) from .misc import parse_cdx_line @@ -22,25 +27,26 @@ class SandcrawlerWorker(object): Usually these get "pushed" into by a RecordPusher. Output goes to another worker (pipeline-style), or defaults to stdout. """ - def __init__(self, sink: Optional['SandcrawlerWorker'] = None): + + def __init__(self, sink: Optional["SandcrawlerWorker"] = None): self.counts: Counter = Counter() self.sink: Optional[SandcrawlerWorker] = sink def push_record(self, task: Any, key: Optional[str] = None) -> Any: - self.counts['total'] += 1 + self.counts["total"] += 1 if not self.want(task): - self.counts['skip'] += 1 + self.counts["skip"] += 1 return result = self.process(task, key=key) if not result: - self.counts['failed'] += 1 + self.counts["failed"] += 1 return - elif type(result) == dict and 'status' in result and len(result['status']) < 32: - self.counts[result['status']] += 1 + elif type(result) == dict and "status" in result and len(result["status"]) < 32: + self.counts[result["status"]] += 1 if self.sink: self.sink.push_record(result) - self.counts['pushed'] += 1 + self.counts["pushed"] += 1 else: print(json.dumps(result)) return result @@ -53,10 +59,9 @@ class SandcrawlerWorker(object): """ return None - def push_record_timeout(self, - task: Any, - key: Optional[str] = None, - timeout: int = 300) -> Any: + def push_record_timeout( + self, task: Any, key: Optional[str] = None, timeout: int = 300 + ) -> Any: """ A wrapper around self.push_record which sets a timeout. @@ -64,6 +69,7 @@ class SandcrawlerWorker(object): multithreading or if signal-based timeouts are used elsewhere in the same process. """ + def timeout_handler(signum: int, frame: Any) -> None: raise TimeoutError("timeout processing record") @@ -73,12 +79,12 @@ class SandcrawlerWorker(object): try: resp = self.push_record(task, key=key) except TimeoutError: - self.counts['timeout'] += 1 + self.counts["timeout"] += 1 resp = self.timeout_response(task) # pylint: disable=assignment-from-none # TODO: what if it is this push_record() itself that is timing out? if resp and self.sink: self.sink.push_record(resp) - self.counts['pushed'] += 1 + self.counts["pushed"] += 1 elif resp: print(json.dumps(resp)) finally: @@ -109,7 +115,7 @@ class SandcrawlerWorker(object): TODO: should derived workers explicitly type-check the 'task' object? """ - raise NotImplementedError('implementation required') + raise NotImplementedError("implementation required") class SandcrawlerFetchWorker(SandcrawlerWorker): @@ -117,25 +123,26 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg, PDFs) from wayback, archive.org, or other sources. 
""" + def __init__(self, wayback_client: Optional[WaybackClient], **kwargs): super().__init__(**kwargs) self.wayback_client = wayback_client def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]: - default_key = record['sha1hex'] + default_key = record["sha1hex"] wayback_sec = None petabox_sec = None - if record.get('warc_path') and record.get('warc_offset'): + if record.get("warc_path") and record.get("warc_offset"): # it's a full CDX dict. fetch using WaybackClient if not self.wayback_client: raise Exception("wayback client not configured for this SandcrawlerFetchWorker") try: start = time.time() blob: bytes = self.wayback_client.fetch_petabox_body( - csize=record['warc_csize'], - offset=record['warc_offset'], - warc_path=record['warc_path'], + csize=record["warc_csize"], + offset=record["warc_offset"], + warc_path=record["warc_path"], ) wayback_sec = time.time() - start except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we: @@ -145,15 +152,15 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): status="error-wayback", error_msg=str(we), ) - elif record.get('url') and record.get('datetime'): + elif record.get("url") and record.get("datetime"): # it's a partial CDX dict or something? fetch using WaybackClient if not self.wayback_client: raise Exception("wayback client not configured for this SandcrawlerFetchWorker") try: start = time.time() blob = self.wayback_client.fetch_replay_body( - url=record['url'], - datetime=record['datetime'], + url=record["url"], + datetime=record["datetime"], ) wayback_sec = time.time() - start except (WaybackError, WaybackContentError) as we: @@ -163,11 +170,12 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): status="error-wayback", error_msg=str(we), ) - elif record.get('item') and record.get('path'): + elif record.get("item") and record.get("path"): # it's petabox link; fetch via HTTP start = time.time() - ia_resp = requests.get("https://archive.org/serve/{}/{}".format( - record['item'], record['path'])) + ia_resp = requests.get( + "https://archive.org/serve/{}/{}".format(record["item"], record["path"]) + ) petabox_sec = time.time() - start try: ia_resp.raise_for_status() @@ -181,7 +189,8 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): blob = ia_resp.content else: raise ValueError( - "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") + "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed" + ) if not blob: return dict( key=default_key, @@ -201,29 +210,31 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): class MultiprocessWrapper(SandcrawlerWorker): - def __init__(self, - worker: SandcrawlerWorker, - sink: Optional[SandcrawlerWorker] = None, - jobs: Optional[int] = None): + def __init__( + self, + worker: SandcrawlerWorker, + sink: Optional[SandcrawlerWorker] = None, + jobs: Optional[int] = None, + ): self.counts = Counter() self.worker = worker self.sink = sink self.pool = multiprocessing.pool.Pool(jobs) def push_batch(self, tasks: List[Any]) -> List[Any]: - self.counts['total'] += len(tasks) + self.counts["total"] += len(tasks) print("... 
processing batch of: {}".format(len(tasks)), file=sys.stderr) results = self.pool.map(self.worker.process, tasks) for result in results: if not result: - self.counts['failed'] += 1 + self.counts["failed"] += 1 return [] - elif type(result) == dict and 'status' in result and len(result['status']) < 32: - self.counts[result['status']] += 1 + elif type(result) == dict and "status" in result and len(result["status"]) < 32: + self.counts[result["status"]] += 1 if self.sink: self.sink.push_record(result) - self.counts['pushed'] += 1 + self.counts["pushed"] += 1 else: print(json.dumps(result)) return results @@ -243,6 +254,7 @@ class BlackholeSink(SandcrawlerWorker): Useful for tests. """ + def push_record(self, task: Any, key: Optional[str] = None) -> Any: return @@ -257,12 +269,14 @@ class KafkaSink(SandcrawlerWorker): self.produce_topic = produce_topic self.kafka_hosts = kafka_hosts - config = self.producer_config({ - 'bootstrap.servers': kafka_hosts, - 'message.max.bytes': 30000000, # ~30 MBytes; broker is ~50 MBytes - 'api.version.request': True, - 'api.version.fallback.ms': 0, - }) + config = self.producer_config( + { + "bootstrap.servers": kafka_hosts, + "message.max.bytes": 30000000, # ~30 MBytes; broker is ~50 MBytes + "api.version.request": True, + "api.version.fallback.ms": 0, + } + ) self.producer = Producer(config) @staticmethod @@ -275,27 +289,29 @@ class KafkaSink(SandcrawlerWorker): def producer_config(self, kafka_config: dict) -> dict: config = kafka_config.copy() - config.update({ - 'delivery.report.only.error': True, - 'default.topic.config': { - 'message.timeout.ms': 30000, - 'request.required.acks': -1, # all brokers must confirm + config.update( + { + "delivery.report.only.error": True, + "default.topic.config": { + "message.timeout.ms": 30000, + "request.required.acks": -1, # all brokers must confirm + }, } - }) + ) return config def push_record(self, msg: Any, key: Optional[str] = None) -> Any: - self.counts['total'] += 1 + self.counts["total"] += 1 if type(msg) == dict: - if not key and 'key' in msg: - key = msg['key'] + if not key and "key" in msg: + key = msg["key"] msg = json.dumps(msg) if type(msg) == str: - msg = msg.encode('utf-8') + msg = msg.encode("utf-8") assert type(msg) == bytes self.producer.produce(self.produce_topic, msg, key=key, on_delivery=self._fail_fast) - self.counts['produced'] += 1 + self.counts["produced"] += 1 # check for errors etc self.producer.poll(0) @@ -314,19 +330,22 @@ class KafkaCompressSink(KafkaSink): """ Variant of KafkaSink for large documents. Used for, eg, GROBID output. """ + def producer_config(self, kafka_config: Dict[str, Any]) -> Dict[str, Any]: config = kafka_config.copy() - config.update({ - 'compression.codec': 'gzip', - 'retry.backoff.ms': 250, - 'linger.ms': 1000, - 'batch.num.messages': 50, - 'delivery.report.only.error': True, - 'default.topic.config': { - 'message.timeout.ms': 30000, - 'request.required.acks': -1, # all brokers must confirm + config.update( + { + "compression.codec": "gzip", + "retry.backoff.ms": 250, + "linger.ms": 1000, + "batch.num.messages": 50, + "delivery.report.only.error": True, + "default.topic.config": { + "message.timeout.ms": 30000, + "request.required.acks": -1, # all brokers must confirm + }, } - }) + ) return config @@ -335,6 +354,7 @@ class RecordPusher: Base class for different record sources to be pushed into workers. Pretty trivial interface, just wraps an importer and pushes records in to it. 
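# A condensed sketch of the KafkaSink pattern above: JSON-encode dict records,
# produce with a fail-fast delivery callback, and poll(0) so delivery errors
# surface promptly. The callback body is an assumption (it is elided in this
# diff); the config keys mirror producer_config().
import json
from typing import Any, Optional

from confluent_kafka import KafkaException, Producer


def fail_fast(err: Any, _msg: Any) -> None:
    if err is not None:
        raise KafkaException(err)


def make_json_sink(kafka_hosts: str, topic: str):
    producer = Producer(
        {
            "bootstrap.servers": kafka_hosts,
            "message.max.bytes": 30000000,  # ~30 MBytes; broker is ~50 MBytes
            "delivery.report.only.error": True,
            "default.topic.config": {
                "message.timeout.ms": 30000,
                "request.required.acks": -1,  # all brokers must confirm
            },
        }
    )

    def push(msg: Any, key: Optional[str] = None) -> None:
        if isinstance(msg, dict):
            key = key or msg.get("key")
            msg = json.dumps(msg)
        if isinstance(msg, str):
            msg = msg.encode("utf-8")
        producer.produce(topic, msg, key=key, on_delivery=fail_fast)
        producer.poll(0)  # serve delivery callbacks; raises via fail_fast on error

    return push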
""" + def __init__(self, worker: SandcrawlerWorker, **kwargs): self.counts: Counter = Counter() self.worker: SandcrawlerWorker = worker @@ -356,7 +376,7 @@ class JsonLinePusher(RecordPusher): self.counts = Counter() self.worker = worker self.json_file = json_file - self.batch_size = kwargs.get('batch_size', None) + self.batch_size = kwargs.get("batch_size", None) if self.batch_size in (0, 1): self.batch_size = None @@ -365,24 +385,24 @@ class JsonLinePusher(RecordPusher): for line in self.json_file: if not line: continue - self.counts['total'] += 1 + self.counts["total"] += 1 try: record = json.loads(line) except json.decoder.JSONDecodeError: - self.counts['error-json-decode'] += 1 + self.counts["error-json-decode"] += 1 continue if self.batch_size: batch.append(record) if len(batch) >= self.batch_size: self.worker.push_batch(batch) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) batch = [] else: self.worker.push_record(record) - self.counts['pushed'] += 1 + self.counts["pushed"] += 1 if self.batch_size and batch: self.worker.push_batch(batch) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) batch = [] self.worker.finish() print("JSON lines pushed: {}".format(self.counts), file=sys.stderr) @@ -394,10 +414,10 @@ class CdxLinePusher(RecordPusher): self.counts = Counter() self.worker = worker self.cdx_file = cdx_file - self.filter_http_statuses = kwargs.get('filter_http_statuses', None) - self.filter_mimetypes = kwargs.get('filter_mimetypes', None) - self.allow_octet_stream = kwargs.get('allow_octet_stream', False) - self.batch_size = kwargs.get('batch_size', None) + self.filter_http_statuses = kwargs.get("filter_http_statuses", None) + self.filter_mimetypes = kwargs.get("filter_mimetypes", None) + self.allow_octet_stream = kwargs.get("allow_octet_stream", False) + self.batch_size = kwargs.get("batch_size", None) if self.batch_size in (0, 1): self.batch_size = None @@ -406,30 +426,32 @@ class CdxLinePusher(RecordPusher): for line in self.cdx_file: if not line: continue - self.counts['total'] += 1 + self.counts["total"] += 1 record = parse_cdx_line(line, normalize=True) if not record: - self.counts['skip-parse'] += 1 + self.counts["skip-parse"] += 1 continue - if self.filter_http_statuses and record[ - 'http_status'] not in self.filter_http_statuses: - self.counts['skip-http_status'] += 1 + if ( + self.filter_http_statuses + and record["http_status"] not in self.filter_http_statuses + ): + self.counts["skip-http_status"] += 1 continue - if self.filter_mimetypes and record['mimetype'] not in self.filter_mimetypes: - self.counts['skip-mimetype'] += 1 + if self.filter_mimetypes and record["mimetype"] not in self.filter_mimetypes: + self.counts["skip-mimetype"] += 1 continue if self.batch_size: batch.append(record) if len(batch) >= self.batch_size: self.worker.push_batch(batch) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) batch = [] else: self.worker.push_record(record) - self.counts['pushed'] += 1 + self.counts["pushed"] += 1 if self.batch_size and batch: self.worker.push_batch(batch) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) batch = [] self.worker.finish() print("CDX lines pushed: {}".format(self.counts), file=sys.stderr) @@ -442,33 +464,33 @@ class ZipfilePusher(RecordPusher): self.worker = worker self.filter_suffix = ".pdf" self.zipfile_path = zipfile_path - self.batch_size = kwargs.get('batch_size', None) + self.batch_size = kwargs.get("batch_size", None) if self.batch_size 
in (0, 1): self.batch_size = None def run(self) -> Counter: batch = [] - with zipfile.ZipFile(self.zipfile_path, 'r') as archive: + with zipfile.ZipFile(self.zipfile_path, "r") as archive: for zipinfo in archive.infolist(): if not zipinfo.filename.endswith(self.filter_suffix): continue - self.counts['total'] += 1 + self.counts["total"] += 1 # NB doesn't really extract the file, just gives you a stream (file-like-object) for reading it - flo = archive.open(zipinfo, 'r') - data = flo.read(2**32) + flo = archive.open(zipinfo, "r") + data = flo.read(2 ** 32) flo.close() if self.batch_size: batch.append(data) if len(batch) >= self.batch_size: self.worker.push_batch(batch) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) batch = [] else: self.worker.push_record(data) - self.counts['pushed'] += 1 + self.counts["pushed"] += 1 if self.batch_size and batch: self.worker.push_batch(batch) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) batch = [] self.worker.finish() print("ZIP PDFs pushed: {}".format(self.counts), file=sys.stderr) @@ -476,8 +498,14 @@ class ZipfilePusher(RecordPusher): class KafkaJsonPusher(RecordPusher): - def __init__(self, worker: SandcrawlerWorker, kafka_hosts: str, consume_topic: str, - group: str, **kwargs): + def __init__( + self, + worker: SandcrawlerWorker, + kafka_hosts: str, + consume_topic: str, + group: str, + **kwargs + ): self.counts = Counter() self.worker = worker self.consumer = make_kafka_consumer( @@ -485,14 +513,14 @@ class KafkaJsonPusher(RecordPusher): consume_topic, group, ) - self.push_batches = kwargs.get('push_batches', False) - self.raw_records = kwargs.get('raw_records', False) - self.poll_interval = kwargs.get('poll_interval', 5.0) - self.batch_size = kwargs.get('batch_size', 100) + self.push_batches = kwargs.get("push_batches", False) + self.raw_records = kwargs.get("raw_records", False) + self.poll_interval = kwargs.get("poll_interval", 5.0) + self.batch_size = kwargs.get("batch_size", 100) if self.batch_size in (0, 1): self.batch_size = 1 - self.batch_worker = kwargs.get('batch_worker', False) - self.process_timeout_sec = kwargs.get('process_timeout_sec', 300) + self.batch_worker = kwargs.get("batch_worker", False) + self.process_timeout_sec = kwargs.get("process_timeout_sec", 300) def run(self) -> Counter: while True: @@ -502,11 +530,15 @@ class KafkaJsonPusher(RecordPusher): # case where there there is one update and thousands of creates; # update would be lingering in worker, and if worker crashed # never created. Not great. - batch = self.consumer.consume(num_messages=self.batch_size, - timeout=self.poll_interval) - print("... got {} kafka messages ({}sec poll interval)".format( - len(batch), self.poll_interval), - file=sys.stderr) + batch = self.consumer.consume( + num_messages=self.batch_size, timeout=self.poll_interval + ) + print( + "... got {} kafka messages ({}sec poll interval)".format( + len(batch), self.poll_interval + ), + file=sys.stderr, + ) if not batch: # TODO: could have some larger timeout here and # self.worker.finish() if it's been more than, eg, a couple @@ -518,14 +550,14 @@ class KafkaJsonPusher(RecordPusher): raise KafkaException(msg.error()) # ... 
then process if self.push_batches: - self.counts['total'] += len(batch) - records = [json.loads(msg.value().decode('utf-8')) for msg in batch] + self.counts["total"] += len(batch) + records = [json.loads(msg.value().decode("utf-8")) for msg in batch] self.worker.push_batch(records) - self.counts['pushed'] += len(batch) + self.counts["pushed"] += len(batch) print("Import counts: {}".format(self.worker.counts), file=sys.stderr) else: for msg in batch: - self.counts['total'] += 1 + self.counts["total"] += 1 if self.raw_records: # In this mode, pass the Kafka message as bytes through # without decoding as JSON. Eg, for thumbnails (where @@ -533,7 +565,7 @@ class KafkaJsonPusher(RecordPusher): # from the message) record = msg.value() else: - record = json.loads(msg.value().decode('utf-8')) + record = json.loads(msg.value().decode("utf-8")) # This complex bit of code implements backoff/backpressure # in a way that will not cause this Kafka consumer to lose # partition assignments (resulting in a rebalance). This @@ -543,9 +575,9 @@ class KafkaJsonPusher(RecordPusher): while not done: try: # use timeouts; don't want kafka itself to timeout - self.worker.push_record_timeout(record, - key=msg.key(), - timeout=self.process_timeout_sec) + self.worker.push_record_timeout( + record, key=msg.key(), timeout=self.process_timeout_sec + ) break except SandcrawlerBackoffError as be: print("Backing off for 200 seconds: {}".format(be)) @@ -557,8 +589,8 @@ class KafkaJsonPusher(RecordPusher): assert not empty_batch time.sleep(5) self.consumer.resume(self.consumer.assignment()) - self.counts['pushed'] += 1 - if self.counts['total'] % 500 == 0: + self.counts["pushed"] += 1 + if self.counts["total"] % 500 == 0: print("Import counts: {}".format(self.worker.counts), file=sys.stderr) for msg in batch: # locally store offsets of processed messages; will be @@ -589,25 +621,25 @@ def make_kafka_consumer(hosts: str, consume_topic: str, group: str) -> Consumer: print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? 
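# The backoff path in KafkaJsonPusher.run() pauses the consumer's partition
# assignment, keeps polling so the broker does not evict it from the consumer
# group, then resumes. The pause() call and the exact loop bounds sit in
# context lines not shown here, so this is an approximate sketch of the
# technique rather than the project's code:
import time

from confluent_kafka import Consumer


def back_off(consumer: Consumer, seconds: float = 200.0, step: float = 5.0) -> None:
    assignment = consumer.assignment()
    consumer.pause(assignment)
    deadline = time.time() + seconds
    while time.time() < deadline:
        # paused partitions yield no messages, but calling consume() keeps the
        # consumer alive with respect to max.poll.interval.ms
        empty_batch = consumer.consume(num_messages=1, timeout=0.5)
        assert not empty_batch
        time.sleep(step)
    consumer.resume(assignment)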
raise KafkaException(p.error) - #print("Kafka consumer commit successful") + # print("Kafka consumer commit successful") pass # previously, using pykafka - #auto_commit_enable=True, - #auto_commit_interval_ms=30000, # 30 seconds + # auto_commit_enable=True, + # auto_commit_interval_ms=30000, # 30 seconds conf = { - 'bootstrap.servers': hosts, - 'group.id': group, - 'on_commit': fail_fast, + "bootstrap.servers": hosts, + "group.id": group, + "on_commit": fail_fast, # messages don't have offset marked as stored until processed, # but we do auto-commit stored offsets to broker - 'enable.auto.offset.store': False, - 'enable.auto.commit': True, + "enable.auto.offset.store": False, + "enable.auto.commit": True, # user code timeout; if no poll after this long, assume user code # hung and rebalance (default: 6min) - 'max.poll.interval.ms': 360000, - 'default.topic.config': { - 'auto.offset.reset': 'latest', + "max.poll.interval.ms": 360000, + "default.topic.config": { + "auto.offset.reset": "latest", }, } @@ -615,8 +647,9 @@ def make_kafka_consumer(hosts: str, consume_topic: str, group: str) -> Consumer: for p in partitions: if p.error: raise KafkaException(p.error) - print("Kafka partitions rebalanced: {} / {}".format(consumer, partitions), - file=sys.stderr) + print( + "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), file=sys.stderr + ) consumer = Consumer(conf) # NOTE: it's actually important that topic_name *not* be bytes (UTF-8 diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 3e35807..d42cd8c 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -16,7 +16,7 @@ from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable try: - git_sha = raven.fetch_git_sha('..') + git_sha = raven.fetch_git_sha("..") except Exception: git_sha = None sentry_client = raven.Client(release=git_sha) @@ -29,8 +29,12 @@ def run_grobid_extract(args): kafka_hosts=args.kafka_hosts, produce_topic=produce_topic, ) - grobid_client = GrobidClient(host_url=args.grobid_host, ) - wayback_client = WaybackClient(host_url=args.grobid_host, ) + grobid_client = GrobidClient( + host_url=args.grobid_host, + ) + wayback_client = WaybackClient( + host_url=args.grobid_host, + ) worker = GrobidWorker( grobid_client=grobid_client, wayback_client=wayback_client, @@ -58,7 +62,9 @@ def run_pdf_extract(args): kafka_hosts=args.kafka_hosts, produce_topic=thumbnail_topic, ) - wayback_client = WaybackClient(host_url=args.grobid_host, ) + wayback_client = WaybackClient( + host_url=args.grobid_host, + ) worker = PdfExtractWorker( wayback_client=wayback_client, sink=pdftext_sink, @@ -187,7 +193,9 @@ def run_persist_html_teixml(args: argparse.Namespace) -> None: def run_persist_pdftrio(args): consume_topic = "sandcrawler-{}.pdftrio-output".format(args.env) - worker = PersistPdfTrioWorker(db_url=args.db_url, ) + worker = PersistPdfTrioWorker( + db_url=args.db_url, + ) pusher = KafkaJsonPusher( worker=worker, kafka_hosts=args.kafka_hosts, @@ -226,7 +234,9 @@ def run_ingest_file(args): kafka_hosts=args.kafka_hosts, produce_topic=grobid_topic, ) - grobid_client = GrobidClient(host_url=args.grobid_host, ) + grobid_client = GrobidClient( + host_url=args.grobid_host, + ) pdftext_sink = KafkaCompressSink( kafka_hosts=args.kafka_hosts, produce_topic=pdftext_topic, @@ -267,7 +277,9 @@ def run_ingest_file(args): def run_persist_ingest_file(args): consume_topic = 
"sandcrawler-{}.ingest-file-results".format(args.env) - worker = PersistIngestFileResultWorker(db_url=args.db_url, ) + worker = PersistIngestFileResultWorker( + db_url=args.db_url, + ) pusher = KafkaJsonPusher( worker=worker, kafka_hosts=args.kafka_hosts, @@ -281,114 +293,128 @@ def run_persist_ingest_file(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--kafka-hosts', - default="localhost:9092", - help="list of Kafka brokers (host/port) to use") - parser.add_argument('--env', - default="dev", - help="Kafka topic namespace to use (eg, prod, qa, dev)") - parser.add_argument('--grobid-host', - default="http://grobid.qa.fatcat.wiki", - help="GROBID API host/port") - parser.add_argument('--db-url', - help="postgresql database connection string", - default="postgres:///sandcrawler") - parser.add_argument('--s3-url', help="S3 (seaweedfs) backend URL", default="localhost:9000") - parser.add_argument('--s3-access-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') - or os.environ.get('MINIO_ACCESS_KEY')) - parser.add_argument('--s3-secret-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') - or os.environ.get('MINIO_SECRET_KEY')) - parser.add_argument('--s3-bucket', - help="S3 (seaweedfs) bucket to persist into", - default="sandcrawler-dev") + parser.add_argument( + "--kafka-hosts", + default="localhost:9092", + help="list of Kafka brokers (host/port) to use", + ) + parser.add_argument( + "--env", default="dev", help="Kafka topic namespace to use (eg, prod, qa, dev)" + ) + parser.add_argument( + "--grobid-host", default="http://grobid.qa.fatcat.wiki", help="GROBID API host/port" + ) + parser.add_argument( + "--db-url", + help="postgresql database connection string", + default="postgres:///sandcrawler", + ) + parser.add_argument("--s3-url", help="S3 (seaweedfs) backend URL", default="localhost:9000") + parser.add_argument( + "--s3-access-key", + help="S3 (seaweedfs) credential", + default=os.environ.get("SANDCRAWLER_BLOB_ACCESS_KEY") + or os.environ.get("MINIO_ACCESS_KEY"), + ) + parser.add_argument( + "--s3-secret-key", + help="S3 (seaweedfs) credential", + default=os.environ.get("SANDCRAWLER_BLOB_SECRET_KEY") + or os.environ.get("MINIO_SECRET_KEY"), + ) + parser.add_argument( + "--s3-bucket", help="S3 (seaweedfs) bucket to persist into", default="sandcrawler-dev" + ) subparsers = parser.add_subparsers() sub_grobid_extract = subparsers.add_parser( - 'grobid-extract', - help= - "daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka" + "grobid-extract", + help="daemon that consumes CDX JSON objects from Kafka, uses GROBID to extract XML, pushes to Kafka", ) sub_grobid_extract.set_defaults(func=run_grobid_extract) sub_pdf_extract = subparsers.add_parser( - 'pdf-extract', - help= - "daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka" + "pdf-extract", + help="daemon that consumes CDX JSON objects from Kafka, extracts text and thumbnail, pushes to Kafka", ) sub_pdf_extract.set_defaults(func=run_pdf_extract) sub_persist_grobid = subparsers.add_parser( - 'persist-grobid', - help= - "daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres" + "persist-grobid", + help="daemon that consumes GROBID output from Kafka and pushes to S3 (seaweedfs) and postgres", + ) + sub_persist_grobid.add_argument( + "--s3-only", + 
action="store_true", + help="only upload TEI-XML to S3 (don't write to database)", ) - sub_persist_grobid.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") sub_persist_grobid.add_argument( - '--db-only', - action='store_true', - help="only write status to database (don't upload TEI-XML to S3)") + "--db-only", + action="store_true", + help="only write status to database (don't upload TEI-XML to S3)", + ) sub_persist_grobid.set_defaults(func=run_persist_grobid) sub_persist_pdftext = subparsers.add_parser( - 'persist-pdftext', - help= - "daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres" + "persist-pdftext", + help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres", + ) + sub_persist_pdftext.add_argument( + "--s3-only", + action="store_true", + help="only upload TEI-XML to S3 (don't write to database)", ) - sub_persist_pdftext.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") sub_persist_pdftext.add_argument( - '--db-only', - action='store_true', - help="only write status to database (don't upload TEI-XML to S3)") + "--db-only", + action="store_true", + help="only write status to database (don't upload TEI-XML to S3)", + ) sub_persist_pdftext.set_defaults(func=run_persist_pdftext) sub_persist_thumbnail = subparsers.add_parser( - 'persist-thumbnail', - help= - "daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres" + "persist-thumbnail", + help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres", ) sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail) sub_persist_xml_doc = subparsers.add_parser( - 'persist-xml-doc', - help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket" + "persist-xml-doc", + help="daemon that consumes xml-doc output from Kafka and pushes to S3 (seaweedfs) bucket", ) sub_persist_xml_doc.set_defaults(func=run_persist_xml_doc) sub_persist_html_teixml = subparsers.add_parser( - 'persist-html-teixml', - help= - "daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket" + "persist-html-teixml", + help="daemon that consumes html-teixml output from Kafka and pushes to S3 (seaweedfs) bucket", ) sub_persist_html_teixml.set_defaults(func=run_persist_html_teixml) sub_persist_pdftrio = subparsers.add_parser( - 'persist-pdftrio', - help="daemon that consumes pdftrio output from Kafka and pushes to postgres") + "persist-pdftrio", + help="daemon that consumes pdftrio output from Kafka and pushes to postgres", + ) sub_persist_pdftrio.set_defaults(func=run_persist_pdftrio) sub_ingest_file = subparsers.add_parser( - 'ingest-file', - help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka") - sub_ingest_file.add_argument('--bulk', - action='store_true', - help="consume from bulk kafka topic (eg, for ingest backfill)") + "ingest-file", + help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka", + ) + sub_ingest_file.add_argument( + "--bulk", + action="store_true", + help="consume from bulk kafka topic (eg, for ingest backfill)", + ) sub_ingest_file.add_argument( - '--priority', - action='store_true', - help="consume from priority kafka topic (eg, for SPN requests)") + "--priority", + action="store_true", + help="consume from priority kafka topic (eg, for SPN requests)", + ) 
sub_ingest_file.set_defaults(func=run_ingest_file) sub_persist_ingest_file = subparsers.add_parser( - 'persist-ingest-file', - help="daemon that consumes ingest-file output from Kafka and pushes to postgres") + "persist-ingest-file", + help="daemon that consumes ingest-file output from Kafka and pushes to postgres", + ) sub_persist_ingest_file.set_defaults(func=run_persist_ingest_file) args = parser.parse_args() @@ -399,5 +425,5 @@ def main(): args.func(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py index 9cc9055..4561541 100755 --- a/python/scripts/arabesque2ingestrequest.py +++ b/python/scripts/arabesque2ingestrequest.py @@ -21,43 +21,48 @@ def run(args): if not l.strip(): continue row = json.loads(l) - if not row['hit']: + if not row["hit"]: continue request = { - 'base_url': row['final_url'], - 'ingest_type': args.ingest_type, - 'link_source': args.link_source, - 'link_source_id': row['identifier'], - 'ingest_request_source': args.ingest_request_source, - 'ext_ids': { - args.extid_type: row['identifier'], + "base_url": row["final_url"], + "ingest_type": args.ingest_type, + "link_source": args.link_source, + "link_source_id": row["identifier"], + "ingest_request_source": args.ingest_request_source, + "ext_ids": { + args.extid_type: row["identifier"], }, } if args.release_stage: - assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', - 'update') - request['release_stage'] = args.release_stage + assert args.release_stage in ( + "published", + "submitted", + "accepted", + "draft", + "update", + ) + request["release_stage"] = args.release_stage print("{}".format(json.dumps(request, sort_keys=True))) def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--link-source', - required=True, - help="link_source to include in request") - parser.add_argument('--extid-type', required=True, help="extid to encode identifier as") - parser.add_argument('--ingest-type', - default="pdf", - help="ingest type (pdf, html, xml, etc)") - parser.add_argument('--ingest-request-source', - default="arabesque", - help="to include in request") - parser.add_argument('--release-stage', default=None, help="to include in request") - parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + parser.add_argument( + "--link-source", required=True, help="link_source to include in request" + ) + parser.add_argument("--extid-type", required=True, help="extid to encode identifier as") + parser.add_argument( + "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)" + ) + parser.add_argument( + "--ingest-request-source", default="arabesque", help="to include in request" + ) + parser.add_argument("--release-stage", default=None, help="to include in request") + parser.add_argument( + "json_file", help="arabesque output file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() @@ -65,5 +70,5 @@ def main(): run(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py index 83c04e3..6328f52 100755 --- a/python/scripts/archiveorg_fileset.py +++ b/python/scripts/archiveorg_fileset.py @@ -16,32 +16,31 @@ from typing import Any import internetarchive FORMAT_TO_MIMETYPE = { - 'BZIP': 'application/x-bzip', - 'BZIP2': 
'application/x-bzip2', - 'ZIP': 'application/zip', - 'GZIP': 'application/gzip', - 'RAR': 'application/vnd.rar', - 'TAR': 'application/x-tar', - '7z': 'application/x-7z-compressed', - 'HTML': 'text/html', - 'Text': 'text/plain', - 'PDF': 'application/pdf', - 'CSV': 'text/csv', - 'XML': 'application/xml', - 'JSON': 'application/json', - + "BZIP": "application/x-bzip", + "BZIP2": "application/x-bzip2", + "ZIP": "application/zip", + "GZIP": "application/gzip", + "RAR": "application/vnd.rar", + "TAR": "application/x-tar", + "7z": "application/x-7z-compressed", + "HTML": "text/html", + "Text": "text/plain", + "PDF": "application/pdf", + "CSV": "text/csv", + "XML": "application/xml", + "JSON": "application/json", #'application/msword (.doc)', # .doc #'application/vnd.openxmlformats-officedocument.wordprocessingml.document', # .docx #'application/vnd.ms-excel', # .xls #'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', # .xlsx - 'MP3': 'audio/mpeg', # .mp3 - 'MP4': 'video/mp4', # .mp4 - 'MPEG': 'video/mpeg', # .mpeg - 'JPEG': 'image/jpeg', - 'GIF': 'image/gif', - 'PNG': 'image/png', - 'TIFF': 'image/tiff', - 'Unknown': None, + "MP3": "audio/mpeg", # .mp3 + "MP4": "video/mp4", # .mp4 + "MPEG": "video/mpeg", # .mpeg + "JPEG": "image/jpeg", + "GIF": "image/gif", + "PNG": "image/png", + "TIFF": "image/tiff", + "Unknown": None, } @@ -49,22 +48,22 @@ def want_file(f: dict, item_name: str) -> bool: """ Filters IA API files """ - if f.source != 'original': + if f.source != "original": return False for suffix in [ - '_meta.sqlite', - '_archive.torrent', - '_itemimage.jpg', - '_meta.xml', - '_thumb.png', - '_files.xml', + "_meta.sqlite", + "_archive.torrent", + "_itemimage.jpg", + "_meta.xml", + "_thumb.png", + "_files.xml", ]: if f.name == item_name + suffix or f.name == item_name.lower() + suffix: return False - if f.name.startswith('_'): + if f.name.startswith("_"): return False - if item_name.startswith('academictorrents_'): - for suffix in ['_academictorrents.torrent', '_academictorrents_torrent.txt', '.bib']: + if item_name.startswith("academictorrents_"): + for suffix in ["_academictorrents.torrent", "_academictorrents_torrent.txt", ".bib"]: if f.name == item_name + suffix: return False return True @@ -77,36 +76,38 @@ def parse_file(f: dict) -> dict: assert f.name and f.sha1 and f.md5 assert f.name is not None mf = { - 'path': f.name, - 'size': int(f.size), - 'sha1': f.sha1, - 'md5': f.md5, + "path": f.name, + "size": int(f.size), + "sha1": f.sha1, + "md5": f.md5, } # TODO: will disable this hard check eventually and replace with: - #mimetype = FORMAT_TO_MIMETYPE.get(f.format) + # mimetype = FORMAT_TO_MIMETYPE.get(f.format) mimetype = FORMAT_TO_MIMETYPE[f.format] if mimetype: - mf['extra'] = dict(mimetype=mimetype) + mf["extra"] = dict(mimetype=mimetype) return mf def item_to_fileset(item_name: str, release_id: str, session: internetarchive.ArchiveSession): print(f"processing item={item_name} release_id={release_id}", file=sys.stderr) - if release_id.startswith('release_'): + if release_id.startswith("release_"): release_id = release_id[9:] assert len(release_id) == 26 item = session.get_item(item_name) - assert item.metadata['mediatype'] not in ['collection', 'web'] + assert item.metadata["mediatype"] not in ["collection", "web"] item_files = item.get_files(on_the_fly=False) manifest = [parse_file(f) for f in item_files if want_file(f, item_name)] fileset = { - 'manifest': manifest, - 'urls': [{ - 'rel': 'archive', - 'url': f'https://archive.org/download/{item_name}/', - }, ], - 
'release_ids': [release_id], - #extra={}, + "manifest": manifest, + "urls": [ + { + "rel": "archive", + "url": f"https://archive.org/download/{item_name}/", + }, + ], + "release_ids": [release_id], + # extra={}, } print(json.dumps(fileset)) return fileset @@ -123,12 +124,12 @@ def main(): line = line.strip() if not line: continue - fields = line.split('\t') + fields = line.split("\t") assert len(fields) == 2 item_name = fields[0] release_id = fields[1] item_to_fileset(item_name, release_id=release_id, session=session) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py index aa78aec..0b60da3 100755 --- a/python/scripts/cdx_collection.py +++ b/python/scripts/cdx_collection.py @@ -29,7 +29,7 @@ def run(): collection = sys.argv[1] # Check collection name is clean - assert collection.replace('_', '').replace('-', '').replace('.', '').isalnum() + assert collection.replace("_", "").replace("-", "").replace(".", "").isalnum() tempdir = tempfile.mkdtemp() print("Looking up collection: {}".format(collection)) @@ -45,15 +45,17 @@ def run(): status = True errors = [] for item in item_list: - item = item['identifier'] + item = item["identifier"] # TODO: error handling try: - ret = ia.download(item, - files=[item + '.cdx.gz'], - verbose=True, - destdir=tempdir, - no_directory=True, - retries=1000) + ret = ia.download( + item, + files=[item + ".cdx.gz"], + verbose=True, + destdir=tempdir, + no_directory=True, + retries=1000, + ) status = ret and status except requests.exceptions.ReadTimeout as rt: print(str(rt), file=sys.stderr) @@ -67,14 +69,14 @@ def run(): # Combine files print("Merging and re-compressing all CDX files...") - #subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), - subprocess.run('zcat {0}/*.cdx.gz | gzip > {0}/combined.gz'.format(tempdir), shell=True) + # subprocess.run('zcat {0}/*.cdx.gz | pigz > {0}/combined.gz'.format(tempdir), + subprocess.run("zcat {0}/*.cdx.gz | gzip > {0}/combined.gz".format(tempdir), shell=True) # Move and cleanup - shutil.move('{}/combined.gz'.format(tempdir), '{}.cdx.gz'.format(collection)) + shutil.move("{}/combined.gz".format(tempdir), "{}.cdx.gz".format(collection)) print("Done!") -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py index 4714b60..e3bf4f0 100755 --- a/python/scripts/covid2ingestrequest.py +++ b/python/scripts/covid2ingestrequest.py @@ -18,38 +18,44 @@ def canon(s): def transform_cnki(obj): requests = [] - assert obj['cnki_id'] + assert obj["cnki_id"] requests = [] - requests.append({ - 'base_url': canon(obj['info_url']), - 'ingest_type': 'pdf', - 'link_source': 'cnki_covid19', - 'link_source_id': obj['cnki_id'], - 'ingest_request_source': 'scrape-covid19', - }) - if 'read_url' in obj: - requests.append({ - 'base_url': canon(obj['read_url']), - 'ingest_type': 'pdf', # actually HTML - 'link_source': 'cnki_covid19', - 'link_source_id': obj['cnki_id'], - 'ingest_request_source': 'scrape-covid19', - }) + requests.append( + { + "base_url": canon(obj["info_url"]), + "ingest_type": "pdf", + "link_source": "cnki_covid19", + "link_source_id": obj["cnki_id"], + "ingest_request_source": "scrape-covid19", + } + ) + if "read_url" in obj: + requests.append( + { + "base_url": canon(obj["read_url"]), + "ingest_type": "pdf", # actually HTML + "link_source": "cnki_covid19", + "link_source_id": obj["cnki_id"], + "ingest_request_source": 
"scrape-covid19", + } + ) return requests def transform_wanfang(obj): - assert obj['wanfang_id'] - return [{ - 'base_url': canon(obj['url']), - 'ingest_type': 'pdf', - 'link_source': 'wanfang_covid19', - 'link_source_id': obj['wanfang_id'], - 'ingest_request_source': 'scrape-covid19', - }] + assert obj["wanfang_id"] + return [ + { + "base_url": canon(obj["url"]), + "ingest_type": "pdf", + "link_source": "wanfang_covid19", + "link_source_id": obj["wanfang_id"], + "ingest_request_source": "scrape-covid19", + } + ] def run(args): @@ -58,9 +64,9 @@ def run(args): continue row = json.loads(l) - if 'wanfang_id' in row: + if "wanfang_id" in row: requests = transform_wanfang(row) or [] - elif 'cnki_id' in row: + elif "cnki_id" in row: requests = transform_cnki(row) or [] else: continue @@ -70,9 +76,9 @@ def run(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="COVID-19 metadata file to use", - type=argparse.FileType('r')) + parser.add_argument( + "json_file", help="COVID-19 metadata file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() @@ -80,5 +86,5 @@ def main(): run(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py index 3b53235..3c769cf 100755 --- a/python/scripts/deliver_dumpgrobid_to_s3.py +++ b/python/scripts/deliver_dumpgrobid_to_s3.py @@ -45,38 +45,38 @@ def b32_hex(s): s = s[5:] if len(s) != 32: return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") -class DeliverDumpGrobidS3(): +class DeliverDumpGrobidS3: def __init__(self, s3_bucket, **kwargs): self.rstore = None self.count = Counter() self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'grobid/') - self.s3_suffix = kwargs.get('s3_suffix', '.tei.xml') - self.s3_storage_class = kwargs.get('s3_storage_class', 'STANDARD') - self.s3 = boto3.resource('s3') + self.s3_prefix = kwargs.get("s3_prefix", "grobid/") + self.s3_suffix = kwargs.get("s3_suffix", ".tei.xml") + self.s3_storage_class = kwargs.get("s3_storage_class", "STANDARD") + self.s3 = boto3.resource("s3") self.bucket = self.s3.Bucket(self.s3_bucket) def run(self, dump_file): sys.stderr.write("Starting...\n") for line in dump_file: - line = line.strip().split('\t') + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, grobid_json = line[0], line[1] if len(sha1_hex) != 40: sha1_hex = b32_hex(sha1_hex) assert len(sha1_hex) == 40 grobid = json.loads(grobid_json) - tei_xml = grobid.get('tei_xml') + tei_xml = grobid.get("tei_xml") if not tei_xml: print("{}\tskip empty".format(sha1_hex)) - self.count['skip-empty'] += 1 + self.count["skip-empty"] += 1 continue - tei_xml = tei_xml.encode('utf-8') + tei_xml = tei_xml.encode("utf-8") # upload to AWS S3 obj = self.bucket.put_object( Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), @@ -84,7 +84,7 @@ class DeliverDumpGrobidS3(): StorageClass=self.s3_storage_class, ) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(tei_xml))) - self.count['success-s3'] += 1 + self.count["success-s3"] += 1 sys.stderr.write("{}\n".format(self.count)) @@ -92,31 +92,32 @@ class DeliverDumpGrobidS3(): def main(): parser = argparse.ArgumentParser() - 
parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="grobid/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - type=str, - default=".tei.xml", - help='file suffix for created objects') - parser.add_argument('--s3-storage-class', - type=str, - default="STANDARD", - help='AWS S3 storage class (redundancy) to use') - parser.add_argument('dump_file', - help="TSV/JSON dump file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" + ) + parser.add_argument( + "--s3-prefix", + type=str, + default="grobid/", + help="key prefix for items created in bucket", + ) + parser.add_argument( + "--s3-suffix", type=str, default=".tei.xml", help="file suffix for created objects" + ) + parser.add_argument( + "--s3-storage-class", + type=str, + default="STANDARD", + help="AWS S3 storage class (redundancy) to use", + ) + parser.add_argument( + "dump_file", help="TSV/JSON dump file", default=sys.stdin, type=argparse.FileType("r") + ) args = parser.parse_args() worker = DeliverDumpGrobidS3(**args.__dict__) worker.run(args.dump_file) -if __name__ == '__main__': # pragma: no cover +if __name__ == "__main__": # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py index ca19b97..fcaf51f 100755 --- a/python/scripts/deliver_gwb_to_disk.py +++ b/python/scripts/deliver_gwb_to_disk.py @@ -27,64 +27,76 @@ sentry_client = raven.Client() class DeliverGwbDisk: def __init__(self, disk_dir, **kwargs): - self.warc_uri_prefix = kwargs.get('warc_uri_prefix') + self.warc_uri_prefix = kwargs.get("warc_uri_prefix") self.rstore = None self.count = Counter() # /serve/ instead of /download/ doesn't record view count - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/") # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', - os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get( + "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") + ) self.disk_dir = disk_dir - self.disk_prefix = kwargs.get('disk_prefix', 'pdf/') - self.disk_suffix = kwargs.get('disk_suffix', '.pdf') + self.disk_prefix = kwargs.get("disk_prefix", "pdf/") + self.disk_suffix = kwargs.get("disk_suffix", ".pdf") def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: self.rstore = ResourceStore( - loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + loaderfactory=CDXLoaderFactory( + webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url, + ) + ) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: return None, dict( status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", ) except ValueError as ve: return None, dict( status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})". 
- format(ve)) + reason="failed to load file contents from wayback/petabox (ValueError: {})".format( + ve + ), + ) except EOFError as eofe: return None, dict( status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})". - format(eofe)) + reason="failed to load file contents from wayback/petabox (EOFError: {})".format( + eofe + ), + ) except TypeError as te: return None, dict( status="error", - reason= - "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" - .format(te)) + reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( + te + ), + ) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: - return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + return None, dict( + status="error", + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0], + ) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: return None, dict( status="error", - reason= - "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". - format(ire)) + reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( + ire + ), + ) return raw_content, None def run(self, manifest_file): @@ -95,47 +107,57 @@ class DeliverGwbDisk: os.makedirs(fpath, exist_ok=True) sys.stderr.write("Starting...\n") for line in manifest_file: - self.count['total'] += 1 - line = line.strip().split('\t') + self.count["total"] += 1 + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, cdx_json = line[0], line[1] assert len(sha1_hex) == 40 file_cdx = json.loads(cdx_json) # If warc is not item/file.(w)arc.gz form, skip it - if len(file_cdx['warc'].split('/')) != 2: - sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) - print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) - self.count['skip-warc'] += 1 + if len(file_cdx["warc"].split("/")) != 2: + sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"])) + print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) + self.count["skip-warc"] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], - file_cdx['c_size']) + blob, status = self.fetch_warc_content( + file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] + ) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], - status['reason'])) - self.count['err-petabox-fetch'] += 1 + print( + "{}\terror petabox\t{}\t{}".format( + sha1_hex, file_cdx["warc"], status["reason"] + ) + ) + self.count["err-petabox-fetch"] += 1 continue elif not blob: print("{}\tskip-empty-blob".format(sha1_hex)) - self.count['skip-empty-blob'] += 1 + self.count["skip-empty-blob"] += 1 continue # verify sha1 if sha1_hex != hashlib.sha1(blob).hexdigest(): - #assert sha1_hex == hashlib.sha1(blob).hexdigest() - #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) + # assert sha1_hex == hashlib.sha1(blob).hexdigest() + # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) print("{}\terror 
petabox-hash-mismatch".format(sha1_hex)) - self.count['err-petabox-hash-mismatch'] += 1 + self.count["err-petabox-hash-mismatch"] += 1 - self.count['petabox-ok'] += 1 + self.count["petabox-ok"] += 1 # save to disk - fpath = "{}/{}{}/{}/{}{}".format(self.disk_dir, self.disk_prefix, sha1_hex[0:2], - sha1_hex[2:4], sha1_hex, self.disk_suffix) - with open(fpath, 'wb') as f: + fpath = "{}/{}{}/{}/{}{}".format( + self.disk_dir, + self.disk_prefix, + sha1_hex[0:2], + sha1_hex[2:4], + sha1_hex, + self.disk_suffix, + ) + with open(fpath, "wb") as f: f.write(blob) print("{}\tsuccess\t{}\t{}".format(sha1_hex, fpath, len(blob))) - self.count['success-disk'] += 1 + self.count["success-disk"] += 1 sys.stderr.write("{}\n".format(self.count)) @@ -143,31 +165,35 @@ class DeliverGwbDisk: def main(): parser = argparse.ArgumentParser() - parser.add_argument('--disk-dir', - required=True, - type=str, - help='local base directory to save into') - parser.add_argument('--disk-prefix', - type=str, - default="pdf/", - help='directory prefix for items created in bucket') - parser.add_argument('--disk-suffix', - type=str, - default=".pdf", - help='file suffix for created files') - parser.add_argument('--warc-uri-prefix', - type=str, - default='https://archive.org/serve/', - help='URI where WARCs can be found') - parser.add_argument('manifest_file', - help="TSV/JSON manifest file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--disk-dir", required=True, type=str, help="local base directory to save into" + ) + parser.add_argument( + "--disk-prefix", + type=str, + default="pdf/", + help="directory prefix for items created in bucket", + ) + parser.add_argument( + "--disk-suffix", type=str, default=".pdf", help="file suffix for created files" + ) + parser.add_argument( + "--warc-uri-prefix", + type=str, + default="https://archive.org/serve/", + help="URI where WARCs can be found", + ) + parser.add_argument( + "manifest_file", + help="TSV/JSON manifest file", + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() worker = DeliverGwbDisk(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover +if __name__ == "__main__": # pragma: no cover main() diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py index f9b3b19..1f08c4f 100755 --- a/python/scripts/deliver_gwb_to_s3.py +++ b/python/scripts/deliver_gwb_to_s3.py @@ -54,111 +54,128 @@ sentry_client = raven.Client() class DeliverGwbS3: def __init__(self, s3_bucket, **kwargs): - self.warc_uri_prefix = kwargs.get('warc_uri_prefix') + self.warc_uri_prefix = kwargs.get("warc_uri_prefix") self.rstore = None self.count = Counter() # /serve/ instead of /download/ doesn't record view count - self.petabox_base_url = kwargs.get('petabox_base_url', 'http://archive.org/serve/') + self.petabox_base_url = kwargs.get("petabox_base_url", "http://archive.org/serve/") # gwb library will fall back to reading from /opt/.petabox/webdata.secret - self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', - os.environ.get('PETABOX_WEBDATA_SECRET')) + self.petabox_webdata_secret = kwargs.get( + "petabox_webdata_secret", os.environ.get("PETABOX_WEBDATA_SECRET") + ) self.s3_bucket = s3_bucket - self.s3_prefix = kwargs.get('s3_prefix', 'pdf/') - self.s3_suffix = kwargs.get('s3_suffix', '.pdf') - self.s3 = boto3.resource('s3') + self.s3_prefix = kwargs.get("s3_prefix", "pdf/") + self.s3_suffix = kwargs.get("s3_suffix", ".pdf") + self.s3 = boto3.resource("s3") 
self.bucket = self.s3.Bucket(self.s3_bucket) def fetch_warc_content(self, warc_path, offset, c_size): warc_uri = self.warc_uri_prefix + warc_path if not self.rstore: self.rstore = ResourceStore( - loaderfactory=CDXLoaderFactory(webdata_secret=self.petabox_webdata_secret, - download_base_url=self.petabox_base_url)) + loaderfactory=CDXLoaderFactory( + webdata_secret=self.petabox_webdata_secret, + download_base_url=self.petabox_base_url, + ) + ) try: gwb_record = self.rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: return None, dict( status="error", - reason="failed to load file contents from wayback/petabox (ResourceUnavailable)" + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)", ) except ValueError as ve: return None, dict( status="error", - reason="failed to load file contents from wayback/petabox (ValueError: {})". - format(ve)) + reason="failed to load file contents from wayback/petabox (ValueError: {})".format( + ve + ), + ) except EOFError as eofe: return None, dict( status="error", - reason="failed to load file contents from wayback/petabox (EOFError: {})". - format(eofe)) + reason="failed to load file contents from wayback/petabox (EOFError: {})".format( + eofe + ), + ) except TypeError as te: return None, dict( status="error", - reason= - "failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)" - .format(te)) + reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format( + te + ), + ) # Note: could consider a generic "except Exception" here, as we get so # many petabox errors. Do want jobs to fail loud and clear when the # whole cluster is down though. if gwb_record.get_status()[0] != 200: - return None, dict(status="error", - reason="archived HTTP response (WARC) was not 200", - warc_status=gwb_record.get_status()[0]) + return None, dict( + status="error", + reason="archived HTTP response (WARC) was not 200", + warc_status=gwb_record.get_status()[0], + ) try: raw_content = gwb_record.open_raw_content().read() except IncompleteRead as ire: return None, dict( status="error", - reason= - "failed to read actual file contents from wayback/petabox (IncompleteRead: {})". 
- format(ire)) + reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format( + ire + ), + ) return raw_content, None def run(self, manifest_file): sys.stderr.write("Starting...\n") for line in manifest_file: - self.count['total'] += 1 - line = line.strip().split('\t') + self.count["total"] += 1 + line = line.strip().split("\t") if len(line) != 2: - self.count['skip-line'] += 1 + self.count["skip-line"] += 1 continue sha1_hex, cdx_json = line[0], line[1] assert len(sha1_hex) == 40 file_cdx = json.loads(cdx_json) # If warc is not item/file.(w)arc.gz form, skip it - if len(file_cdx['warc'].split('/')) != 2: - sys.stderr.write('WARC path not petabox item/file: {}'.format(file_cdx['warc'])) - print("{}\tskip warc\t{}".format(sha1_hex, file_cdx['warc'])) - self.count['skip-warc'] += 1 + if len(file_cdx["warc"].split("/")) != 2: + sys.stderr.write("WARC path not petabox item/file: {}".format(file_cdx["warc"])) + print("{}\tskip warc\t{}".format(sha1_hex, file_cdx["warc"])) + self.count["skip-warc"] += 1 continue # fetch from GWB/petabox via HTTP range-request - blob, status = self.fetch_warc_content(file_cdx['warc'], file_cdx['offset'], - file_cdx['c_size']) + blob, status = self.fetch_warc_content( + file_cdx["warc"], file_cdx["offset"], file_cdx["c_size"] + ) if blob is None and status: - print("{}\terror petabox\t{}\t{}".format(sha1_hex, file_cdx['warc'], - status['reason'])) - self.count['err-petabox-fetch'] += 1 + print( + "{}\terror petabox\t{}\t{}".format( + sha1_hex, file_cdx["warc"], status["reason"] + ) + ) + self.count["err-petabox-fetch"] += 1 continue elif not blob: print("{}\tskip-empty-blob".format(sha1_hex)) - self.count['skip-empty-blob'] += 1 + self.count["skip-empty-blob"] += 1 continue # verify sha1 if sha1_hex != hashlib.sha1(blob).hexdigest(): - #assert sha1_hex == hashlib.sha1(blob).hexdigest() - #sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) + # assert sha1_hex == hashlib.sha1(blob).hexdigest() + # sys.stderr.write("{}\terror petabox-mismatch\n".format(sha1_hex)) print("{}\terror petabox-hash-mismatch".format(sha1_hex)) - self.count['err-petabox-hash-mismatch'] += 1 + self.count["err-petabox-hash-mismatch"] += 1 - self.count['petabox-ok'] += 1 + self.count["petabox-ok"] += 1 # upload to AWS S3 - obj = self.bucket.put_object(Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], - sha1_hex, self.s3_suffix), - Body=blob) + obj = self.bucket.put_object( + Key="{}{}/{}{}".format(self.s3_prefix, sha1_hex[0:4], sha1_hex, self.s3_suffix), + Body=blob, + ) print("{}\tsuccess\t{}\t{}".format(sha1_hex, obj.key, len(blob))) - self.count['success-s3'] += 1 + self.count["success-s3"] += 1 sys.stderr.write("{}\n".format(self.count)) @@ -166,31 +183,32 @@ class DeliverGwbS3: def main(): parser = argparse.ArgumentParser() - parser.add_argument('--s3-bucket', - required=True, - type=str, - help='AWS S3 bucket to upload into') - parser.add_argument('--s3-prefix', - type=str, - default="pdf/", - help='key prefix for items created in bucket') - parser.add_argument('--s3-suffix', - type=str, - default=".pdf", - help='file suffix for created objects') - parser.add_argument('--warc-uri-prefix', - type=str, - default='https://archive.org/serve/', - help='URI where WARCs can be found') - parser.add_argument('manifest_file', - help="TSV/JSON manifest file", - default=sys.stdin, - type=argparse.FileType('r')) + parser.add_argument( + "--s3-bucket", required=True, type=str, help="AWS S3 bucket to upload into" + ) + parser.add_argument( + 
"--s3-prefix", type=str, default="pdf/", help="key prefix for items created in bucket" + ) + parser.add_argument( + "--s3-suffix", type=str, default=".pdf", help="file suffix for created objects" + ) + parser.add_argument( + "--warc-uri-prefix", + type=str, + default="https://archive.org/serve/", + help="URI where WARCs can be found", + ) + parser.add_argument( + "manifest_file", + help="TSV/JSON manifest file", + default=sys.stdin, + type=argparse.FileType("r"), + ) args = parser.parse_args() worker = DeliverGwbS3(**args.__dict__) worker.run(args.manifest_file) -if __name__ == '__main__': # pragma: no cover +if __name__ == "__main__": # pragma: no cover main() diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index 84a2c2c..67286b9 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -17,23 +17,21 @@ import urlcanon DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", "ncbi.nlm.nih.gov/", - #"semanticscholar.org/", + # "semanticscholar.org/", "://doi.org/", "zenodo.org/", "figshare.com/", "://archive.org/", ".archive.org/", - # large publishers/platforms; may remove in the future - #"://link.springer.com/", - #"://dergipark.gov.tr/", - #"frontiersin.org/", - #"scielo", + # "://link.springer.com/", + # "://dergipark.gov.tr/", + # "frontiersin.org/", + # "scielo", ] # these default to PDF; note that we also do pdf ingests for HTML pages @@ -63,35 +61,35 @@ def transform(obj: dict) -> List[dict]: Returns a list of dicts. """ - doaj_id = obj['id'].lower() + doaj_id = obj["id"].lower() assert doaj_id - bibjson = obj['bibjson'] - if not bibjson['link']: + bibjson = obj["bibjson"] + if not bibjson["link"]: return [] requests = [] doi: Optional[str] = None - for ident in (bibjson['identifier'] or []): - if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'): - doi = ident['id'].lower() + for ident in bibjson["identifier"] or []: + if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."): + doi = ident["id"].lower() - for link in (bibjson['link'] or []): - if link.get('type') != "fulltext" or not link.get('url'): + for link in bibjson["link"] or []: + if link.get("type") != "fulltext" or not link.get("url"): continue - ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower()) + ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower()) if not ingest_types: continue skip = False for domain in DOMAIN_BLOCKLIST: - if domain in link['url'].lower(): + if domain in link["url"].lower(): skip = True if skip: continue try: - base_url = canon(link['url'].strip()) + base_url = canon(link["url"].strip()) except UnicodeEncodeError: continue @@ -100,18 +98,18 @@ def transform(obj: dict) -> List[dict]: for ingest_type in ingest_types: request = { - 'base_url': base_url, - 'ingest_type': ingest_type, - 'link_source': 'doaj', - 'link_source_id': doaj_id, - 'ingest_request_source': 'doaj', - 'release_stage': 'published', - 'rel': 'publisher', - 'ext_ids': { - 'doi': doi, - 'doaj': doaj_id, + "base_url": base_url, + "ingest_type": ingest_type, + "link_source": "doaj", + "link_source_id": doaj_id, + "ingest_request_source": "doaj", + "release_stage": "published", + "rel": "publisher", + "ext_ids": { + "doi": doi, + "doaj": doaj_id, }, - 'edit_extra': {}, + "edit_extra": {}, } requests.append(request) @@ -131,9 +129,9 @@ def run(args) -> 
None: def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="DOAJ article dump file to use", - type=argparse.FileType('r')) + parser.add_argument( + "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() @@ -141,5 +139,5 @@ def main() -> None: run(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py index 54c3d5f..44c091c 100755 --- a/python/scripts/enrich_scored_matches.py +++ b/python/scripts/enrich_scored_matches.py @@ -24,23 +24,25 @@ import sys def run(): for line in sys.stdin: - line = line.split('\t') + line = line.split("\t") assert len(line) == 5 - raw_sha1 = line[0].replace('sha1:', '') + raw_sha1 = line[0].replace("sha1:", "") dois = json.loads(line[1]) cdx = json.loads(line[2]) mimetype = line[3] size = int(line[4]) - sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower() + sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode("ascii").lower() - obj = dict(sha1=sha1, - dois=dois, - cdx=[dict(url=cdx['url'], dt=cdx['dt'])], - size=size, - mimetype=mimetype) + obj = dict( + sha1=sha1, + dois=dois, + cdx=[dict(url=cdx["url"], dt=cdx["dt"])], + size=size, + mimetype=mimetype, + ) print(json.dumps(obj)) -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py index a474393..8fce0d9 100755 --- a/python/scripts/filter_grobid_metadata.py +++ b/python/scripts/filter_grobid_metadata.py @@ -3,39 +3,41 @@ import json import sys -with open('title_slug_denylist.txt', 'r') as f: +with open("title_slug_denylist.txt", "r") as f: TITLE_DENYLIST = [l.strip() for l in f] -TITLE_DENYLIST.extend(( - 'editorial', - 'advertisement', - 'bookreviews', - 'reviews', - 'nr', - 'abstractoriginalarticle', - 'originalarticle', - 'impactfactor', - 'articlenumber', -)) +TITLE_DENYLIST.extend( + ( + "editorial", + "advertisement", + "bookreviews", + "reviews", + "nr", + "abstractoriginalarticle", + "originalarticle", + "impactfactor", + "articlenumber", + ) +) # The full name can't *entirely* be one of these NAME_DENYLIST = ( - 'phd', - 'phdstudent', + "phd", + "phdstudent", ) def tokenize(s, remove_whitespace=True): - s.replace(''', "'") + s.replace("'", "'") # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()]) + s = "".join([c for c in s.lower() if c.isalpha() or c.isspace()]) if remove_whitespace: - s = ''.join(s.split()) + s = "".join(s.split()) # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').decode('utf8').replace('?', '') + return s.encode("ascii", "replace").decode("utf8").replace("?", "") assert tokenize("Impact Factor: 2.114") == "impactfactor" @@ -50,14 +52,14 @@ def filter_title(title): title_slug = tokenize(title, remove_whitespace=True) if len(title_slug) < 10 or title_slug in TITLE_DENYLIST: return None - if title_slug.startswith('nr'): + if title_slug.startswith("nr"): return None - if title.lower().replace('.', '').startswith('int j '): + if title.lower().replace(".", "").startswith("int j "): return None for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "): if title.startswith(prefix): - title.replace(prefix, '') + title.replace(prefix, "") if 
title.startswith("The Journal of "): return None @@ -81,17 +83,17 @@ def filter_title(title): return None # too deep subtitling/splitting - if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1: + if title.count(":") > 3 or title.count("|") > 1 or title.count(".") > 1: return None return title def filter_author_name(name): - name = name['name'] - if name.strip().lower().replace(' ', '') in NAME_DENYLIST: + name = name["name"] + if name.strip().lower().replace(" ", "") in NAME_DENYLIST: return None - return ' '.join([t for t in name.split() if tokenize(t)]) + return " ".join([t for t in name.split() if tokenize(t)]) def filter_authors(l): @@ -107,45 +109,58 @@ def filter_journal_name(name): # same denylist, for now if not name: return None - name = name.replace(' e-ISSN', '').replace(' p-ISSN', '') + name = name.replace(" e-ISSN", "").replace(" p-ISSN", "") slug_name = tokenize(name) if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º": return None - for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", - "Research Article ", "Available online www.jocpr.com "): + for prefix in ( + "/ ", + "~ ", + "& ", + "© ", + "Original Research Article ", + "Original Article ", + "Research Article ", + "Available online www.jocpr.com ", + ): if name.startswith(prefix): - name = name.replace(prefix, '') - for suffix in (" Available online at www.sciarena.com", " Original Article", - " Available online at", " ISSN", " ISSUE"): + name = name.replace(prefix, "") + for suffix in ( + " Available online at www.sciarena.com", + " Original Article", + " Available online at", + " ISSN", + " ISSUE", + ): if name.endswith(suffix): - name = name.replace(suffix, '') + name = name.replace(suffix, "") if "====================" in name: return None if len(name) > 150: return None - return ' '.join(name.split()) + return " ".join(name.split()) def filter_metadata(obj): - if not (obj.get('title') and obj.get('authors')): + if not (obj.get("title") and obj.get("authors")): return None - title = filter_title(obj['title']) + title = filter_title(obj["title"]) if not title: - #sys.stderr.write("bad title\n") + # sys.stderr.write("bad title\n") return None else: - obj['title'] = title - obj['authors'] = filter_authors(obj['authors']) - obj['citations'] = filter_refs(obj['citations']) - obj['journal']['name'] = filter_journal_name(obj['journal']['name']) + obj["title"] = title + obj["authors"] = filter_authors(obj["authors"]) + obj["citations"] = filter_refs(obj["citations"]) + obj["journal"]["name"] = filter_journal_name(obj["journal"]["name"]) return obj def run(invert=False): for line in sys.stdin: - fields = line.split('\t') + fields = line.split("\t") if len(fields) == 5: raw = fields[4] elif len(fields) == 1: @@ -162,7 +177,7 @@ def run(invert=False): fields[4] = processed else: fields[0] = processed - print('\t'.join(fields)) + print("\t".join(fields)) elif invert: print(raw.strip()) diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index fda9098..87dae16 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -31,15 +31,15 @@ REQUIRE_AUTHORS = False def tokenize(s, remove_whitespace=False): - s.replace(''', "'") + s.replace("'", "'") # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()]) if remove_whitespace: - s = ''.join(s.split()) + s = "".join(s.split()) # 
Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').replace(b'?', b'') + return s.encode("ascii", "replace").replace(b"?", b"") def check_authors(left, right): @@ -53,7 +53,7 @@ def check_authors(left, right): return False right_all = tokenize(" ".join(right)) for i in range(len(left)): - l = left[i].lower().replace('jr.', '').split() + l = left[i].lower().replace("jr.", "").split() if not l: return False l = tokenize(l[-1]) @@ -61,21 +61,21 @@ def check_authors(left, right): # weird author name (single char) return False if l not in right_all: - #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) + # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) return False return True def test_check_authors(): assert check_authors([], []) == bool(not REQUIRE_AUTHORS) - assert not check_authors([], ['one']) - assert check_authors(['one'], ['one']) - assert check_authors(['one two'], ['One Two']) - assert check_authors(['two'], ['One Two']) - assert check_authors(['two'], ['two, one']) - assert check_authors(['mago'], ['Mr. Magoo']) - assert check_authors(['Mr. Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + assert not check_authors([], ["one"]) + assert check_authors(["one"], ["one"]) + assert check_authors(["one two"], ["One Two"]) + assert check_authors(["two"], ["One Two"]) + assert check_authors(["two"], ["two, one"]) + assert check_authors(["mago"], ["Mr. Magoo"]) + assert check_authors(["Mr. Magoo"], ["Mr Magoo"]) + assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) # Rows are (score, left, right) @@ -90,10 +90,10 @@ def process_group(rows): left = json.loads(row[1]) right = json.loads(row[2]) # authors must roughly match - if not check_authors(left['authors'], right['authors']): + if not check_authors(left["authors"], right["authors"]): continue # years must match (if defined) - if left['year'] and right['year'] and left['year'] != right['year']: + if left["year"] and right["year"] and left["year"] != right["year"]: continue filtered.append((left, right)) @@ -105,8 +105,8 @@ def process_group(rows): group_ids = set() for row in filtered[1:]: (left, right) = row - l_id = left['fatcat_release'] - r_id = right['fatcat_release'] + l_id = left["fatcat_release"] + r_id = right["fatcat_release"] releases[l_id] = left releases[r_id] = right if not group_ids: @@ -131,7 +131,7 @@ def run(): # group lines by slug, and process in batches for line in sys.stdin: - line = line.strip().split('\t') + line = line.strip().split("\t") assert len(line) == 4 slug = line[0] if last_slug and slug != last_slug and lines: @@ -146,5 +146,5 @@ def run(): process_group(lines) -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index 3251852..c5b7eef 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -23,15 +23,15 @@ require_authors = 1 def tokenize(s, remove_whitespace=False): - s.replace(''', "'") + s.replace("'", "'") # Remove non-alphanumeric characters - s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()]) + s = "".join([c for c in s.lower() if c.isalnum() or c.isspace()]) if remove_whitespace: - s = ''.join(s.split()) + s = "".join(s.split()) # Encode as dumb ASCII (TODO: this is horrible) - return s.encode('ascii', 'replace').replace(b'?', b'') + return s.encode("ascii", "replace").replace(b"?", 
b"") def check_authors(left, right): @@ -45,7 +45,7 @@ def check_authors(left, right): return False right_all = tokenize(" ".join(right)) for i in range(len(left)): - l = left[i].lower().replace('jr.', '').split() + l = left[i].lower().replace("jr.", "").split() if not l: return False l = tokenize(l[-1]) @@ -53,21 +53,21 @@ def check_authors(left, right): # weird author name (single char) return False if l not in right_all: - #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) + # print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) return False return True def test_check_authors(): assert not check_authors([], []) - assert not check_authors([], ['one']) - assert check_authors(['one'], ['one']) - assert check_authors(['one two'], ['One Two']) - assert check_authors(['two'], ['One Two']) - assert check_authors(['two'], ['two, one']) - assert check_authors(['mago'], ['Mr. Magoo']) - assert check_authors(['Mr. Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + assert not check_authors([], ["one"]) + assert check_authors(["one"], ["one"]) + assert check_authors(["one two"], ["One Two"]) + assert check_authors(["two"], ["One Two"]) + assert check_authors(["two"], ["two, one"]) + assert check_authors(["mago"], ["Mr. Magoo"]) + assert check_authors(["Mr. Magoo"], ["Mr Magoo"]) + assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) # Rows are (score, grobid, crossref) @@ -81,14 +81,14 @@ def process_group(rows): continue grobid = json.loads(row[1]) crossref = json.loads(row[2]) - if not check_authors(crossref['authors'], grobid['authors']): - #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) + if not check_authors(crossref["authors"], grobid["authors"]): + # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) continue else: - #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) + # print("YES: {} {}".format(crossref['authors'], grobid['authors'])) pass - sha1 = grobid['sha1'] - doi = crossref['doi'].lower() + sha1 = grobid["sha1"] + doi = crossref["doi"].lower() l = keepers.get(sha1, list()) l.append(doi) keepers[sha1] = l @@ -103,7 +103,7 @@ def run(): # group lines by slug, and process in batches for line in sys.stdin: - line = line.strip().split('\t') + line = line.strip().split("\t") assert len(line) == 4 slug = line[0] if last_slug and slug != last_slug and lines: @@ -117,5 +117,5 @@ def run(): process_group(lines) -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py index b42153c..b01e46a 100755 --- a/python/scripts/grobid_affiliations.py +++ b/python/scripts/grobid_affiliations.py @@ -16,40 +16,40 @@ from grobid2json import teixml2json def parse_hbase(line): - line = line.split('\t') + line = line.split("\t") assert len(line) == 2 sha1hex = line[0] obj = json.loads(line[1]) - tei_xml = obj['tei_xml'] + tei_xml = obj["tei_xml"] return sha1hex, tei_xml def parse_pg(line): obj = json.loads(line) - return obj['sha1hex'], obj['tei_xml'] + return obj["sha1hex"], obj["tei_xml"] -def run(mode='hbase'): +def run(mode="hbase"): for line in sys.stdin: - if mode == 'hbase': + if mode == "hbase": sha1hex, tei_xml = parse_hbase(line) - elif mode == 'pg': + elif mode == "pg": sha1hex, tei_xml = parse_pg(line) else: - raise NotImplementedError('parse mode: {}'.format(mode)) + raise NotImplementedError("parse mode: 
{}".format(mode)) obj = teixml2json(tei_xml, encumbered=False) affiliations = [] - for author in obj['authors']: - if author.get('affiliation'): - affiliations.append(author['affiliation']) + for author in obj["authors"]: + if author.get("affiliation"): + affiliations.append(author["affiliation"]) if affiliations: # don't duplicate affiliations; only the unique ones affiliations = list(set([json.dumps(a) for a in affiliations])) affiliations = [json.loads(a) for a in affiliations] - print('\t'.join([sha1hex, json.dumps(affiliations)])) + print("\t".join([sha1hex, json.dumps(affiliations)])) -if __name__ == '__main__': +if __name__ == "__main__": run() diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py index c9bc134..f941881 100755 --- a/python/scripts/import_grobid_metadata.py +++ b/python/scripts/import_grobid_metadata.py @@ -9,59 +9,59 @@ MAX_ABSTRACT_BYTES = 4096 def parse_grobid_json(obj): - if not obj.get('title'): + if not obj.get("title"): return None extra = dict() - if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: - abobj = dict(mimetype="text/plain", language=None, content=obj.get('abstract').strip()) + if obj.get("abstract") and len(obj.get("abstract")) < MAX_ABSTRACT_BYTES: + abobj = dict(mimetype="text/plain", language=None, content=obj.get("abstract").strip()) abstracts = [abobj] else: abstracts = None contribs = [] - for a in obj.get('authors', []): + for a in obj.get("authors", []): c = dict(raw_name=a, role="author") contribs.append(c) refs = [] - for raw in obj.get('citations', []): + for raw in obj.get("citations", []): extra = dict() ref = dict() - ref['key'] = raw.get('id') - if raw.get('title'): - ref['title'] = raw['title'].strip() - if raw.get('date'): + ref["key"] = raw.get("id") + if raw.get("title"): + ref["title"] = raw["title"].strip() + if raw.get("date"): try: - year = int(raw['date'].strip()[:4]) - ref['year'] = year + year = int(raw["date"].strip()[:4]) + ref["year"] = year except: pass - for key in ('volume', 'url', 'issue', 'publisher'): + for key in ("volume", "url", "issue", "publisher"): if raw.get(key): extra[key] = raw[key].strip() - if raw.get('authors'): - extra['authors'] = [a['name'] for a in raw['authors']] + if raw.get("authors"): + extra["authors"] = [a["name"] for a in raw["authors"]] if extra: extra = dict(grobid=extra) else: extra = None - ref['extra'] = extra + ref["extra"] = extra refs.append(ref) release_type = "journal-article" release_date = None - if obj.get('date'): + if obj.get("date"): # TODO: only returns year, ever? how to handle? - release_date = datetime.datetime(year=obj['date'], month=1, day=1) + release_date = datetime.datetime(year=obj["date"], month=1, day=1) - if obj.get('doi'): - extra['doi'] = obj['doi'].lower() - if obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] + if obj.get("doi"): + extra["doi"] = obj["doi"].lower() + if obj["journal"].get("name"): + extra["container_name"] = obj["journal"]["name"] - extra['is_longtail_oa'] = True + extra["is_longtail_oa"] = True # TODO: ISSN/eISSN handling? or just journal name lookup? 
@@ -70,15 +70,17 @@ def parse_grobid_json(obj): else: extra = None - return dict(title=obj['title'].strip(), - contribs=contribs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), - abstracts=abstracts, - release_type=release_type, - release_date=release_date, - extra=extra) + return dict( + title=obj["title"].strip(), + contribs=contribs, + publisher=obj["journal"].get("publisher"), + volume=obj["journal"].get("volume"), + issue=obj["journal"].get("issue"), + abstracts=abstracts, + release_type=release_type, + release_date=release_date, + extra=extra, + ) def run(): diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py index 70731d5..d52e793 100755 --- a/python/scripts/ingestrequest_row2json.py +++ b/python/scripts/ingestrequest_row2json.py @@ -15,13 +15,13 @@ def transform(row): """ dict-to-dict """ - row.pop('created', None) - extra = row.pop('request', None) or {} - for k in ('ext_ids', 'edit_extra'): + row.pop("created", None) + extra = row.pop("request", None) or {} + for k in ("ext_ids", "edit_extra"): if k in extra: row[k] = extra[k] - if 'release_ident' in extra: - row['fatcat'] = dict(release_ident=extra['release_ident']) + if "release_ident" in extra: + row["fatcat"] = dict(release_ident=extra["release_ident"]) return row @@ -38,9 +38,9 @@ def run(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="arabesque output file to use", - type=argparse.FileType('r')) + parser.add_argument( + "json_file", help="arabesque output file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() @@ -48,5 +48,5 @@ def main(): run(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py index 1f4a19f..9607b85 100755 --- a/python/scripts/oai2ingestrequest.py +++ b/python/scripts/oai2ingestrequest.py @@ -13,7 +13,6 @@ import urlcanon DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", @@ -26,17 +25,16 @@ DOMAIN_BLOCKLIST = [ "://archive.org/", ".archive.org/", "://127.0.0.1/", - # OAI specific additions "://hdl.handle.net/", ] RELEASE_STAGE_MAP = { - 'info:eu-repo/semantics/draftVersion': 'draft', - 'info:eu-repo/semantics/submittedVersion': 'submitted', - 'info:eu-repo/semantics/acceptedVersion': 'accepted', - 'info:eu-repo/semantics/publishedVersion': 'published', - 'info:eu-repo/semantics/updatedVersion': 'updated', + "info:eu-repo/semantics/draftVersion": "draft", + "info:eu-repo/semantics/submittedVersion": "submitted", + "info:eu-repo/semantics/acceptedVersion": "accepted", + "info:eu-repo/semantics/publishedVersion": "published", + "info:eu-repo/semantics/updatedVersion": "updated", } @@ -52,38 +50,38 @@ def transform(obj): """ requests = [] - if not obj.get('oai') or not obj['oai'].startswith('oai:'): + if not obj.get("oai") or not obj["oai"].startswith("oai:"): return [] - if not obj.get('urls'): + if not obj.get("urls"): return [] # look in obj['formats'] for PDF? - if obj.get('formats'): + if obj.get("formats"): # if there is a list of formats, and it does not contain PDF, then # skip. Note that we will continue if there is no formats list. 
has_pdf = False - for f in obj['formats']: - if 'pdf' in f.lower(): + for f in obj["formats"]: + if "pdf" in f.lower(): has_pdf = True if not has_pdf: return [] doi = None - if obj.get('doi'): - doi = obj['doi'][0].lower().strip() - if not doi.startswith('10.'): + if obj.get("doi"): + doi = obj["doi"][0].lower().strip() + if not doi.startswith("10."): doi = None # infer release stage and/or type from obj['types'] release_stage = None - for t in obj.get('types', []): + for t in obj.get("types", []): if t in RELEASE_STAGE_MAP: release_stage = RELEASE_STAGE_MAP[t] # TODO: infer rel somehow? Eg, repository vs. OJS publisher rel = None - for url in obj['urls']: + for url in obj["urls"]: skip = False for domain in DOMAIN_BLOCKLIST: if domain in url: @@ -96,18 +94,18 @@ def transform(obj): continue request = { - 'base_url': base_url, - 'ingest_type': 'pdf', - 'link_source': 'oai', - 'link_source_id': obj['oai'].lower(), - 'ingest_request_source': 'metha-bulk', - 'release_stage': release_stage, - 'rel': rel, - 'ext_ids': { - 'doi': doi, - 'oai': obj['oai'].lower(), + "base_url": base_url, + "ingest_type": "pdf", + "link_source": "oai", + "link_source_id": obj["oai"].lower(), + "ingest_request_source": "metha-bulk", + "release_stage": release_stage, + "rel": rel, + "ext_ids": { + "doi": doi, + "oai": obj["oai"].lower(), }, - 'edit_extra': {}, + "edit_extra": {}, } requests.append(request) @@ -127,9 +125,11 @@ def run(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="OAI-PMH dump file to use (usually stdin)", - type=argparse.FileType('r')) + parser.add_argument( + "json_file", + help="OAI-PMH dump file to use (usually stdin)", + type=argparse.FileType("r"), + ) subparsers = parser.add_subparsers() args = parser.parse_args() @@ -137,5 +137,5 @@ def main(): run(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py index 3f81b3b..8b57c5b 100755 --- a/python/scripts/pdf_thumbnail.py +++ b/python/scripts/pdf_thumbnail.py @@ -22,15 +22,16 @@ def run(inpath, outpath): renderer = poppler.PageRenderer() full_page = renderer.render_page(page) - img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', - "BGRA", 0, 1) + img = Image.frombuffer( + "RGBA", (full_page.width, full_page.height), full_page.data, "raw", "BGRA", 0, 1 + ) img.thumbnail((180, 300), Image.BICUBIC) - #img.thumbnail((360,600), Image.BICUBIC) + # img.thumbnail((360,600), Image.BICUBIC) img.save(outpath) - #img.save(outpath, quality=95) + # img.save(outpath, quality=95) -if __name__ == '__main__': +if __name__ == "__main__": if len(sys.argv) != 3: print("expect two parameters: INPUT.png OUTPUT.png", file=sys.stderr) sys.exit(-1) diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index b79f316..ad5353b 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -11,7 +11,6 @@ import urlcanon DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", @@ -25,11 +24,11 @@ DOMAIN_BLOCKLIST = [ ] RELEASE_STAGE_MAP = { - 'draftVersion': 'draft', - 'submittedVersion': 'submitted', - 'acceptedVersion': 'accepted', - 'publishedVersion': 'published', - 'updatedVersion': 'updated', + "draftVersion": "draft", + "submittedVersion": "submitted", + 
"acceptedVersion": "accepted", + "publishedVersion": "published", + "updatedVersion": "updated", } @@ -45,44 +44,44 @@ def transform(obj): """ requests = [] - if not obj['doi'].startswith('10.'): + if not obj["doi"].startswith("10."): return requests - if not obj['oa_locations']: + if not obj["oa_locations"]: return requests - for location in obj['oa_locations']: - if not location['url_for_pdf']: + for location in obj["oa_locations"]: + if not location["url_for_pdf"]: continue skip = False for domain in DOMAIN_BLOCKLIST: - if domain in location['url_for_pdf']: + if domain in location["url_for_pdf"]: skip = True if skip: continue try: - base_url = canon(location['url_for_pdf']) + base_url = canon(location["url_for_pdf"]) except UnicodeEncodeError: continue request = { - 'base_url': base_url, - 'ingest_type': 'pdf', - 'link_source': 'unpaywall', - 'link_source_id': obj['doi'].lower(), - 'ingest_request_source': 'unpaywall', - 'release_stage': RELEASE_STAGE_MAP.get(location['version']), - 'rel': location['host_type'], - 'ext_ids': { - 'doi': obj['doi'].lower(), + "base_url": base_url, + "ingest_type": "pdf", + "link_source": "unpaywall", + "link_source_id": obj["doi"].lower(), + "ingest_request_source": "unpaywall", + "release_stage": RELEASE_STAGE_MAP.get(location["version"]), + "rel": location["host_type"], + "ext_ids": { + "doi": obj["doi"].lower(), }, - 'edit_extra': {}, + "edit_extra": {}, } - if obj.get('oa_status'): - request['edit_extra']['oa_status'] = obj['oa_status'] - if location.get('evidence'): - request['edit_extra']['evidence'] = location['evidence'] - if location['pmh_id']: - request['ext_ids']['pmh_id'] = location['pmh_id'] + if obj.get("oa_status"): + request["edit_extra"]["oa_status"] = obj["oa_status"] + if location.get("evidence"): + request["edit_extra"]["evidence"] = location["evidence"] + if location["pmh_id"]: + request["ext_ids"]["pmh_id"] = location["pmh_id"] requests.append(request) return requests @@ -101,9 +100,9 @@ def run(args): def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="unpaywall dump file to use", - type=argparse.FileType('r')) + parser.add_argument( + "json_file", help="unpaywall dump file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() @@ -111,5 +110,5 @@ def main(): run(args) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 15d43fb..c086d73 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -8,13 +8,15 @@ from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) -with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: +with open("tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml", "rb") as f: REAL_TEI_XML = f.read() @pytest.fixture def grobid_client(): - client = GrobidClient(host_url="http://dummy-grobid", ) + client = GrobidClient( + host_url="http://dummy-grobid", + ) return client @@ -22,39 +24,43 @@ def grobid_client(): def test_grobid_503(grobid_client): status = b'{"status": "done broke due to 503"}' - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', - status=503, - body=status) + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=503, + body=status, + ) resp = 
grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 - assert resp['status_code'] == 503 - assert resp['status'] == "error" + assert resp["status_code"] == 503 + assert resp["status"] == "error" @responses.activate def test_grobid_success(grobid_client): - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', - status=200, - body=REAL_TEI_XML, - content_type='text/xml') + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="text/xml", + ) resp = grobid_client.process_fulltext(FAKE_PDF_BYTES) # grobid gets POST 1x times assert len(responses.calls) == 1 - assert resp['status_code'] == 200 - assert resp['status'] == "success" - #print(type(resp['tei_xml'])) - #print(type(REAL_TEI_XML)) - assert resp['tei_xml'] == REAL_TEI_XML.decode('ISO-8859-1') + assert resp["status_code"] == 200 + assert resp["status"] == "success" + # print(type(resp['tei_xml'])) + # print(type(REAL_TEI_XML)) + assert resp["tei_xml"] == REAL_TEI_XML.decode("ISO-8859-1") @responses.activate @@ -63,22 +69,24 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): # noqa: F811 sink = BlackholeSink() worker = GrobidWorker(grobid_client, wayback_client, sink=sink) - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', - status=200, - body=REAL_TEI_XML, - content_type='text/xml') + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="text/xml", + ) - with open('tests/files/example.cdx', 'r') as cdx_file: + with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher( worker, cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], ) pusher_counts = pusher.run() - assert pusher_counts['total'] - assert pusher_counts['pushed'] == 7 - assert pusher_counts['pushed'] == worker.counts['total'] + assert pusher_counts["total"] + assert pusher_counts["pushed"] == 7 + assert pusher_counts["pushed"] == worker.counts["total"] - assert len(responses.calls) == worker.counts['total'] + assert len(responses.calls) == worker.counts["total"] diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py index 7637871..98888e8 100644 --- a/python/tests/test_grobid2json.py +++ b/python/tests/test_grobid2json.py @@ -8,9 +8,9 @@ from grobid2json import * def test_small_xml(): - with open('tests/files/small.xml', 'r') as f: + with open("tests/files/small.xml", "r") as f: tei_xml = f.read() - with open('tests/files/small.json', 'r') as f: + with open("tests/files/small.json", "r") as f: json_form = json.loads(f.read()) assert teixml2json(tei_xml) == json_form diff --git a/python/tests/test_html.py b/python/tests/test_html.py index 1caca15..614b802 100644 --- a/python/tests/test_html.py +++ b/python/tests/test_html.py @@ -7,7 +7,8 @@ def test_extract_fulltext_url(): assert resp == {} resp = extract_fulltext_url( - "http://dummy-site/", b"""<html> + "http://dummy-site/", + b"""<html> <head> <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf"> </head> @@ -15,14 +16,17 @@ def test_extract_fulltext_url(): <h1>my big article here</h1> blah </body> - </html>""") - assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf" - assert resp['technique'] == "citation_pdf_url" + </html>""", + ) + assert resp["pdf_url"] == 
"http://www.example.com/content/271/20/11761.full.pdf" + assert resp["technique"] == "citation_pdf_url" - with open('tests/files/plos_one_article.html', 'rb') as f: + with open("tests/files/plos_one_article.html", "rb") as f: resp = extract_fulltext_url( "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978", f.read(), ) - assert resp[ - 'pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + assert ( + resp["pdf_url"] + == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + ) diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 727fef9..ba4acf1 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -3,8 +3,8 @@ from sandcrawler.ingest_html import * def test_html_extract_ojs3() -> None: - with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f: + with open("tests/files/first_monday_ojs3_fulltext.html", "rb") as f: ojs3_html = f.read() fulltext = html_extract_body_teixml(ojs3_html) - assert fulltext['status'] == 'success' + assert fulltext["status"] == "success" diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py index a4c1e41..69bd211 100644 --- a/python/tests/test_html_metadata.py +++ b/python/tests/test_html_metadata.py @@ -7,14 +7,20 @@ from sandcrawler.html_metadata import * def test_html_metadata_plos() -> None: - with open('tests/files/plos_one_article.html', 'r') as f: + with open("tests/files/plos_one_article.html", "r") as f: plos_html = f.read() meta = html_extract_biblio("http://example.org", HTMLParser(plos_html)) assert meta is not None - assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" + assert ( + meta.title + == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody" + ) assert meta.doi == "10.1371/journal.pone.0213978" - assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + assert ( + meta.pdf_fulltext_url + == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + ) assert meta.contrib_names == [ "Yang Li", "Tuanjie Wang", @@ -37,18 +43,26 @@ def test_html_metadata_plos() -> None: assert meta.volume == "14" assert meta.container_issn == "1932-6203" assert meta.publisher == "Public Library of Science" - assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references + assert ( + meta.raw_references + and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. 
Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" + in meta.raw_references + ) assert meta.release_type == "article-journal" - assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + assert ( + meta.pdf_fulltext_url + == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable" + ) def test_html_metadata_elife() -> None: - with open('tests/files/elife_article.html', 'r') as f: + with open("tests/files/elife_article.html", "r") as f: elife_html = f.read() - meta = html_extract_biblio("https://elifesciences.org/articles/44753", - HTMLParser(elife_html)) + meta = html_extract_biblio( + "https://elifesciences.org/articles/44753", HTMLParser(elife_html) + ) assert meta is not None assert meta.title == "Parallel visual circuitry in a basal chordate" assert meta.doi == "10.7554/eLife.44753" @@ -65,17 +79,23 @@ def test_html_metadata_elife() -> None: # 2019-04-18 assert meta.release_date == datetime.date(year=2019, month=4, day=18) assert meta.publisher == "eLife Sciences Publications Limited" - assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D" + assert ( + meta.pdf_fulltext_url + == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D" + ) def test_html_metadata_peerj() -> None: - with open('tests/files/peerj_oa_article.html', 'r') as f: + with open("tests/files/peerj_oa_article.html", "r") as f: peerj_html = f.read() meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html)) assert meta is not None - assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles" + assert ( + meta.title + == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles" + ) assert meta.doi == "10.7717/peerj.4375" assert meta.contrib_names == [ "Heather Piwowar", @@ -96,7 +116,7 @@ def test_html_metadata_peerj() -> None: def test_html_metadata_nature() -> None: - with open('tests/files/nature_article.html', 'r') as f: + with open("tests/files/nature_article.html", "r") as f: nature_html = f.read() meta = html_extract_biblio("http://example.org", HTMLParser(nature_html)) @@ -111,12 +131,15 @@ def test_html_metadata_nature() -> None: assert meta.release_date == datetime.date(year=2020, month=9, day=10) assert meta.publisher == "Nature Publishing Group" # note: some error in dublin code in nature HTML resulting in duplication - assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk." + assert ( + meta.abstract + == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk. Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk." 
+ ) def test_html_metadata_ojs3() -> None: - with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f: + with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f: ojs3_html = f.read() meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html)) @@ -134,14 +157,20 @@ def test_html_metadata_ojs3() -> None: # "2020/09/10" assert meta.release_date == datetime.date(year=2020, month=9, day=10) assert meta.lang == "en" - assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate." - assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729" + assert ( + meta.abstract + == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate." 
+ ) + assert ( + meta.html_fulltext_url + == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729" + ) assert meta.release_type == "article-journal" def test_html_metadata_dlib() -> None: - with open('tests/files/dlib_05vanhyning.html', 'r') as f: + with open("tests/files/dlib_05vanhyning.html", "r") as f: dlib_html = f.read() meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html)) @@ -176,7 +205,7 @@ def adblock() -> Any: def test_html_resources(adblock) -> None: - with open('tests/files/dlib_05vanhyning.html', 'r') as f: + with open("tests/files/dlib_05vanhyning.html", "r") as f: dlib_html = f.read() resources = html_extract_resources( @@ -189,9 +218,9 @@ def test_html_resources(adblock) -> None: # check that adblock working for r in resources: - assert '/ga.js' not in r['url'] + assert "/ga.js" not in r["url"] - with open('tests/files/plos_one_article.html', 'r') as f: + with open("tests/files/plos_one_article.html", "r") as f: plos_html = f.read() resources = html_extract_resources( @@ -202,9 +231,9 @@ def test_html_resources(adblock) -> None: # check that custom adblock working for r in resources: - assert 'crossmark-cdn.crossref.org' not in r['url'] + assert "crossmark-cdn.crossref.org" not in r["url"] - with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f: + with open("tests/files/first_monday_ojs3_landingpage.html", "r") as f: monday_html = f.read() resources = html_extract_resources( @@ -213,7 +242,7 @@ def test_html_resources(adblock) -> None: adblock, ) - with open('tests/files/elife_article.html', 'r') as f: + with open("tests/files/elife_article.html", "r") as f: elife_html = f.read() resources = html_extract_resources( @@ -222,7 +251,7 @@ def test_html_resources(adblock) -> None: adblock, ) - with open('tests/files/nature_article.html', 'r') as f: + with open("tests/files/nature_article.html", "r") as f: nature_html = f.read() resources = html_extract_resources( diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 617f2b4..ad8c22e 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -11,7 +11,9 @@ from sandcrawler import * @pytest.fixture def ingest_worker(wayback_client, spn_client): - grobid_client = GrobidClient(host_url="http://dummy-grobid", ) + grobid_client = GrobidClient( + host_url="http://dummy-grobid", + ) worker = IngestFileWorker( wayback_client=wayback_client, spn_client=spn_client, @@ -22,8 +24,12 @@ def ingest_worker(wayback_client, spn_client): @pytest.fixture def ingest_worker_pdf(wayback_client_pdf, spn_client): - grobid_client = GrobidClient(host_url="http://dummy-grobid", ) - pgrest_client = SandcrawlerPostgrestClient(api_url="http://dummy-postgrest", ) + grobid_client = GrobidClient( + host_url="http://dummy-grobid", + ) + pgrest_client = SandcrawlerPostgrestClient( + api_url="http://dummy-postgrest", + ) worker = IngestFileWorker( wayback_client=wayback_client_pdf, spn_client=spn_client, @@ -36,182 +42,197 @@ def ingest_worker_pdf(wayback_client_pdf, spn_client): @responses.activate def test_ingest_success(ingest_worker_pdf): - with open('tests/files/dummy.pdf', 'rb') as f: + with open("tests/files/dummy.pdf", "rb") as f: pdf_bytes = f.read() request = { - 'ingest_type': 'pdf', - 'base_url': "http://dummy-host/", + "ingest_type": "pdf", + "base_url": "http://dummy-host/", } - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 
'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) - responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=pdf_bytes) - responses.add(responses.GET, - 'http://dummy-postgrest/grobid?sha1hex=eq.{}'.format( - "90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) - responses.add(responses.GET, - 'http://dummy-postgrest/pdf_meta?sha1hex=eq.{}'.format( - "90ffd2359008d82298821d16b21778c5c39aec36"), - status=200, - body=json.dumps([])) - responses.add(responses.POST, - 'http://dummy-grobid/api/processFulltextDocument', - status=200, - body=REAL_TEI_XML, - content_type='text/xml') + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=pdf_bytes, + ) + responses.add( + responses.GET, + "http://dummy-postgrest/grobid?sha1hex=eq.{}".format( + "90ffd2359008d82298821d16b21778c5c39aec36" + ), + status=200, + body=json.dumps([]), + ) + responses.add( + responses.GET, + "http://dummy-postgrest/pdf_meta?sha1hex=eq.{}".format( + "90ffd2359008d82298821d16b21778c5c39aec36" + ), + status=200, + body=json.dumps([]), + ) + responses.add( + responses.POST, + "http://dummy-grobid/api/processFulltextDocument", + status=200, + body=REAL_TEI_XML, + content_type="text/xml", + ) resp = ingest_worker_pdf.process(request) print(resp) - assert resp['hit'] is True - assert resp['status'] == "success" - assert resp['request'] == request - assert resp['terminal']['terminal_sha1hex'] == resp['file_meta']['sha1hex'] - assert type(resp['terminal']['terminal_dt']) == str - assert resp['terminal']['terminal_url'] == TARGET + "/redirect" - assert resp['terminal']['terminal_status_code'] - assert type(resp['file_meta']['size_bytes']) == int - assert resp['file_meta']['mimetype'] == "application/pdf" - assert resp['cdx']['url'] == TARGET + "/redirect" - assert 'warc_path' not in resp['cdx'] - assert 'revisit_cdx' not in resp - assert resp['grobid']['status'] == "success" - assert resp['grobid']['status_code'] == 200 - assert resp['grobid']['grobid_version'] - assert 'fatcat_release' in resp['grobid'] - assert 'grobid_version' not in resp['grobid']['metadata'] - assert 'fatcat_release' not in resp['grobid']['metadata'] - assert 'tei_xml' not in resp['grobid'] - assert resp['pdf_meta']['status'] == "success" - assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 - assert resp['pdf_meta'].get('text') is None + assert resp["hit"] is True + assert resp["status"] == "success" + assert resp["request"] == request + assert resp["terminal"]["terminal_sha1hex"] == 
resp["file_meta"]["sha1hex"] + assert type(resp["terminal"]["terminal_dt"]) == str + assert resp["terminal"]["terminal_url"] == TARGET + "/redirect" + assert resp["terminal"]["terminal_status_code"] + assert type(resp["file_meta"]["size_bytes"]) == int + assert resp["file_meta"]["mimetype"] == "application/pdf" + assert resp["cdx"]["url"] == TARGET + "/redirect" + assert "warc_path" not in resp["cdx"] + assert "revisit_cdx" not in resp + assert resp["grobid"]["status"] == "success" + assert resp["grobid"]["status_code"] == 200 + assert resp["grobid"]["grobid_version"] + assert "fatcat_release" in resp["grobid"] + assert "grobid_version" not in resp["grobid"]["metadata"] + assert "fatcat_release" not in resp["grobid"]["metadata"] + assert "tei_xml" not in resp["grobid"] + assert resp["pdf_meta"]["status"] == "success" + assert resp["pdf_meta"]["pdf_extra"]["page_count"] == 1 + assert resp["pdf_meta"].get("text") is None @responses.activate def test_ingest_landing(ingest_worker): request = { - 'ingest_type': 'pdf', - 'base_url': "http://dummy-host/", + "ingest_type": "pdf", + "base_url": "http://dummy-host/", } - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) - responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY, + ) # this is for second time around; don't want to fetch same landing page # HTML again and result in a loop - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body="<html></html>") + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body="<html></html>", + ) resp = ingest_worker.process(request) print(resp) - assert resp['hit'] is False - assert resp['status'] == "no-pdf-link" - assert resp['request'] == request - assert 'terminal' in resp - assert 'file_meta' not in resp - assert 'cdx' not in resp - assert 'revisit_cdx' not in resp - assert 'grobid' not in resp + assert resp["hit"] is False + assert resp["status"] == "no-pdf-link" + assert resp["request"] == request + assert "terminal" in resp + assert "file_meta" 
not in resp + assert "cdx" not in resp + assert "revisit_cdx" not in resp + assert "grobid" not in resp @responses.activate def test_ingest_blocklist(ingest_worker): ingest_worker.base_url_blocklist = [ - '://test.fatcat.wiki/', + "://test.fatcat.wiki/", ] request = { - 'ingest_type': 'pdf', - 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + "ingest_type": "pdf", + "base_url": "https://test.fatcat.wiki/asdfasdf.pdf", } resp = ingest_worker.process(request) - assert resp['hit'] is False - assert resp['status'] == "skip-url-blocklist" - assert resp['request'] == request + assert resp["hit"] is False + assert resp["status"] == "skip-url-blocklist" + assert resp["request"] == request @responses.activate def test_ingest_wall_blocklist(ingest_worker): ingest_worker.wall_blocklist = [ - '://test.fatcat.wiki/', + "://test.fatcat.wiki/", ] request = { - 'ingest_type': 'pdf', - 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + "ingest_type": "pdf", + "base_url": "https://test.fatcat.wiki/asdfasdf.pdf", } resp = ingest_worker.process(request) - assert resp['hit'] is False - assert resp['status'] == "skip-wall" - assert resp['request'] == request + assert resp["hit"] is False + assert resp["status"] == "skip-wall" + assert resp["request"] == request @responses.activate def test_ingest_cookie_blocklist(ingest_worker): request = { - 'ingest_type': 'pdf', - 'base_url': "https://test.fatcat.wiki/cookieAbsent", + "ingest_type": "pdf", + "base_url": "https://test.fatcat.wiki/cookieAbsent", } resp = ingest_worker.process(request) - assert resp['hit'] is False - assert resp['status'] == "blocked-cookie" - assert resp['request'] == request + assert resp["hit"] is False + assert resp["status"] == "blocked-cookie" + assert resp["request"] == request diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index f82eac4..9bd8b5f 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -43,7 +43,10 @@ def test_cdx_fetch(cdx_client): assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV" assert resp.warc_csize == 25338 assert resp.warc_offset == 240665973 - assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz" + assert ( + resp.warc_path + == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz" + ) # bogus datetime; shouldn't match with pytest.raises(KeyError): @@ -73,8 +76,9 @@ def test_cdx_lookup_best(cdx_client): def test_wayback_fetch(wayback_client): resp = wayback_client.fetch_petabox( - 25683, 2676464871, - "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz" + 25683, + 2676464871, + "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz", ) assert resp.body @@ -112,9 +116,9 @@ def test_cdx_fetch_spn2(cdx_client): # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410 - #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz - #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz - #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 
https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz + # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz + # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz + # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209" datetime = "20200110222410" @@ -145,7 +149,7 @@ def test_lookup_ftp(wayback_client): assert resp.revisit_cdx.url != url file_meta = gen_file_metadata(resp.body) - assert file_meta['sha1hex'] == resp.cdx.sha1hex + assert file_meta["sha1hex"] == resp.cdx.sha1hex # not revisit? url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf" @@ -158,7 +162,7 @@ def test_lookup_ftp(wayback_client): assert resp.cdx.url == url file_meta = gen_file_metadata(resp.body) - assert file_meta['sha1hex'] == resp.cdx.sha1hex + assert file_meta["sha1hex"] == resp.cdx.sha1hex @pytest.mark.skip(reason="hits prod services, requires auth") @@ -168,10 +172,10 @@ def test_crawl_ftp(spn_client, wayback_client): resp = spn_client.crawl_resource(url, wayback_client) # FTP isn't supported yet! 
- #assert resp.hit is True - #assert resp.status == "success" - #assert resp.terminal_url == url - #assert resp.cdx.url == url + # assert resp.hit is True + # assert resp.status == "success" + # assert resp.terminal_url == url + # assert resp.cdx.url == url assert resp.hit is False assert resp.status == "spn2-no-ftp" diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 5830dc9..2bad851 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -1,77 +1,87 @@ import pytest -from sandcrawler import (b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, - parse_cdx_line) +from sandcrawler import ( + b32_hex, + clean_url, + gen_file_metadata, + gen_file_metadata_path, + parse_cdx_line, +) def test_gen_file_metadata(): # valid (but very small) PDF file - with open('tests/files/dummy.pdf', 'rb') as f: + with open("tests/files/dummy.pdf", "rb") as f: file_meta = gen_file_metadata(f.read()) assert file_meta == { - 'mimetype': 'application/pdf', - 'md5hex': '2942bfabb3d05332b66eb128e0842cff', - 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36', - 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4', - 'size_bytes': 13264, + "mimetype": "application/pdf", + "md5hex": "2942bfabb3d05332b66eb128e0842cff", + "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36", + "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4", + "size_bytes": 13264, } # valid HTML fm = gen_file_metadata( - b"""<html><head><title>dummy</title></head><body>html document</body></html>""") - assert fm['mimetype'] == 'text/html' + b"""<html><head><title>dummy</title></head><body>html document</body></html>""" + ) + assert fm["mimetype"] == "text/html" # bogus text fm = gen_file_metadata(b"asdf1234") - assert fm['mimetype'] == 'text/plain' - assert fm['size_bytes'] == 8 + assert fm["mimetype"] == "text/plain" + assert fm["size_bytes"] == 8 def test_gen_file_metadata_path(): # valid (but very small) PDF file - file_meta = gen_file_metadata_path('tests/files/dummy.pdf') + file_meta = gen_file_metadata_path("tests/files/dummy.pdf") assert file_meta == { - 'mimetype': 'application/pdf', - 'md5hex': '2942bfabb3d05332b66eb128e0842cff', - 'sha1hex': '90ffd2359008d82298821d16b21778c5c39aec36', - 'sha256hex': '3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4', - 'size_bytes': 13264, + "mimetype": "application/pdf", + "md5hex": "2942bfabb3d05332b66eb128e0842cff", + "sha1hex": "90ffd2359008d82298821d16b21778c5c39aec36", + "sha256hex": "3df79d34abbca99308e79cb94461c1893582604d68329a41fd4bec1885e6adb4", + "size_bytes": 13264, } def test_b32_hex(): # valid b32 - assert b32_hex( - 'sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' - assert b32_hex( - 'TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC') == '9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982' + assert ( + b32_hex("sha1:TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC") + == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982" + ) + assert ( + b32_hex("TZCYZ2ULEHYGESS4L3RNH75I23KKFSMC") + == "9e458cea8b21f0624a5c5ee2d3ffa8d6d4a2c982" + ) # sha1hex pass-through - s = 'bda3c1017d52e826bbd1da51efad877272d300f9' + s = "bda3c1017d52e826bbd1da51efad877272d300f9" assert b32_hex(s) == s # invalid with pytest.raises(ValueError): - assert b32_hex('blah') == 'blah' + assert b32_hex("blah") == "blah" def test_parse_cdx_line(): raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 
https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" correct = { - 'sha1b32': "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G", - 'sha1hex': "b2f65203da9929c2f758e8dd587b5524f904dbe6", - 'mimetype': "application/pdf", - 'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", - 'url': "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", - 'datetime': "20170828233154", - 'warc_path': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", - 'warc_offset': 931661233, - 'warc_csize': 210251, - 'http_status': 200, + "sha1b32": "WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G", + "sha1hex": "b2f65203da9929c2f758e8dd587b5524f904dbe6", + "mimetype": "application/pdf", + "surt": "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", + "url": "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf", + "datetime": "20170828233154", + "warc_path": "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", + "warc_offset": 931661233, + "warc_csize": 210251, + "http_status": 200, } assert parse_cdx_line(raw) == correct @@ -92,5 +102,9 @@ def test_invalid_cdx(): def test_clean_url(): assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf" - assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \ - "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" + assert ( + clean_url( + "https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view" + ) + == "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view" + ) diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py index 086243a..9d75655 100644 --- a/python/tests/test_pdfextract.py +++ b/python/tests/test_pdfextract.py @@ -15,30 +15,31 @@ def test_process_fake_pdf(): print(resp) assert resp.status == "not-pdf" - with open('tests/files/dummy_zip.zip', 'rb') as f: + with open("tests/files/dummy_zip.zip", "rb") as f: pdf_bytes = f.read() resp = process_pdf(pdf_bytes) - assert resp.status == 'not-pdf' + assert resp.status == "not-pdf" -@pytest.mark.skipif(poppler.version_string() == '0.71.0', - reason="unsupported version of poppler") +@pytest.mark.skipif( + poppler.version_string() == "0.71.0", reason="unsupported version of poppler" +) def test_process_dummy_pdf(): - with open('tests/files/dummy.pdf', 'rb') as f: + with open("tests/files/dummy.pdf", "rb") as f: pdf_bytes = f.read() resp = process_pdf(pdf_bytes) - assert resp.status == 'success' + assert resp.status == "success" assert resp.page0_thumbnail is not None assert len(resp.text) > 10 assert resp.meta_xml is None - assert resp.file_meta['mimetype'] == 'application/pdf' + assert resp.file_meta["mimetype"] == "application/pdf" print(resp.pdf_info) print(resp.pdf_extra) - assert resp.pdf_info['Author'] == "Evangelos Vlachogiannis" + assert 
resp.pdf_info["Author"] == "Evangelos Vlachogiannis" # 595 x 842 - assert resp.pdf_extra['page0_height'] == 842 - assert resp.pdf_extra['page0_width'] == 595 - assert resp.pdf_extra['page_count'] == 1 + assert resp.pdf_extra["page0_height"] == 842 + assert resp.pdf_extra["page0_width"] == 595 + assert resp.pdf_extra["page_count"] == 1 def test_pdfextract_worker_cdx(wayback_client): # noqa: F811 @@ -46,17 +47,17 @@ def test_pdfextract_worker_cdx(wayback_client): # noqa: F811 sink = BlackholeSink() worker = PdfExtractWorker(wayback_client, sink=sink, thumbnail_sink=sink) - with open('tests/files/example.cdx', 'r') as cdx_file: + with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher( worker, cdx_file, filter_http_statuses=[200, 226], - filter_mimetypes=['application/pdf'], + filter_mimetypes=["application/pdf"], ) pusher_counts = pusher.run() - assert pusher_counts['total'] - assert pusher_counts['pushed'] == 7 - assert pusher_counts['pushed'] == worker.counts['total'] + assert pusher_counts["total"] + assert pusher_counts["pushed"] == 7 + assert pusher_counts["pushed"] == worker.counts["total"] def test_pdfextract_blob_worker(): @@ -64,7 +65,7 @@ def test_pdfextract_blob_worker(): sink = BlackholeSink() worker = PdfExtractBlobWorker(sink=sink, thumbnail_sink=sink) - with open('tests/files/dummy.pdf', 'rb') as f: + with open("tests/files/dummy.pdf", "rb") as f: pdf_bytes = f.read() worker.process(pdf_bytes) diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py index 353a560..ed17d24 100644 --- a/python/tests/test_pushers.py +++ b/python/tests/test_pushers.py @@ -6,22 +6,24 @@ def test_cdx_line_pusher(): sink = BlackholeSink() # vanilla (only default filters) - with open('tests/files/example.cdx', 'r') as cdx_file: + with open("tests/files/example.cdx", "r") as cdx_file: pusher = CdxLinePusher(sink, cdx_file) counts = pusher.run() - assert counts['total'] == 20 - assert counts['skip-parse'] == 1 - assert counts['pushed'] == 19 + assert counts["total"] == 20 + assert counts["skip-parse"] == 1 + assert counts["pushed"] == 19 # HTTP 200 and application/pdf - with open('tests/files/example.cdx', 'r') as cdx_file: - pusher = CdxLinePusher(sink, - cdx_file, - filter_mimetypes=['application/pdf'], - filter_http_statuses=[200, 226]) + with open("tests/files/example.cdx", "r") as cdx_file: + pusher = CdxLinePusher( + sink, + cdx_file, + filter_mimetypes=["application/pdf"], + filter_http_statuses=[200, 226], + ) counts = pusher.run() - assert counts['total'] == 20 - assert counts['skip-parse'] == 1 - assert counts['skip-http_status'] == 10 - assert counts['skip-mimetype'] == 2 - assert counts['pushed'] == 7 + assert counts["total"] == 20 + assert counts["skip-parse"] == 1 + assert counts["skip-http_status"] == 10 + assert counts["skip-mimetype"] == 2 + assert counts["pushed"] == 7 diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py index 50cabb4..f3a5e46 100644 --- a/python/tests/test_savepagenow.py +++ b/python/tests/test_savepagenow.py @@ -15,7 +15,7 @@ PENDING_BODY = { "https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js", "https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.21/jquery-ui.min.js", "https://cdn.onesignal.com/sdks/OneSignalSDK.js", - ] + ], } SUCCESS_BODY = { "status": "success", @@ -25,7 +25,9 @@ SUCCESS_BODY = { "timestamp": "20180326070330", "duration_sec": 6.203, "resources": [ - TARGET, TARGET + "/redirect", "http://brewster.kahle.org/", + TARGET, + TARGET + "/redirect", + 
"http://brewster.kahle.org/", "http://brewster.kahle.org/favicon.ico", "http://brewster.kahle.org/files/2011/07/bkheader-follow.jpg", "http://brewster.kahle.org/files/2016/12/amazon-unhappy.jpg", @@ -40,7 +42,8 @@ SUCCESS_BODY = { "http://brewster.kahle.org/wp-content/themes/twentyten/style.css", "http://brewster.kahle.org/wp-includes/js/wp-embed.min.js?ver=4.9.4", "http://brewster.kahle.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", - "http://platform.twitter.com/widgets.js", "https://archive-it.org/piwik.js", + "http://platform.twitter.com/widgets.js", + "https://archive-it.org/piwik.js", "https://platform.twitter.com/jot.html", "https://platform.twitter.com/js/button.556f0ea0e4da4e66cfdc182016dbd6db.js", "https://platform.twitter.com/widgets/follow_button.f47a2e0b4471326b6fa0f163bda46011.en.html", @@ -54,12 +57,12 @@ SUCCESS_BODY = { "https://www.syndikat.org/wp-content/uploads/2017/11/s_miete_fr-200x116.png", "https://www.syndikat.org/wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1", "https://www.syndikat.org/wp-includes/js/jquery/jquery.js?ver=1.12.4", - "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4" + "https://www.syndikat.org/wp-includes/js/wp-emoji-release.min.js?ver=4.9.4", ], "outlinks": { "https://archive.org/": "xxxxxx89b-f3ca-48d0-9ea6-1d1225e98695", - "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695" - } + "https://other.com": "yyyy89b-f3ca-48d0-9ea6-1d1225e98695", + }, } ERROR_BODY = { "status": "error", @@ -67,17 +70,34 @@ ERROR_BODY = { "status_ext": "error:invalid-host-resolution", "job_id": JOB_ID, "message": "Couldn't resolve host for http://example5123.com.", - "resources": [] + "resources": [], } CDX_SPN_HIT = [ [ - "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect", - "robotflags", "length", "offset", "filename" + "urlkey", + "timestamp", + "original", + "mimetype", + "statuscode", + "digest", + "redirect", + "robotflags", + "length", + "offset", + "filename", ], [ - "wiki,fatcat)/", "20180326070330", TARGET + "/redirect", "application/pdf", "200", - CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", - "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz" + "wiki,fatcat)/", + "20180326070330", + TARGET + "/redirect", + "application/pdf", + "200", + CDX_BEST_SHA1B32, + "-", + "-", + "8445", + "108062304", + "liveweb-20200108215212-wwwb-spn04.us.archive.org-kols1pud.warc.gz", ], ] @@ -96,25 +116,30 @@ def spn_client(): @responses.activate def test_savepagenow_success(spn_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + 
body=json.dumps(SUCCESS_BODY), + ) resp = spn_client.save_url_now_v2(TARGET) @@ -124,35 +149,38 @@ def test_savepagenow_success(spn_client): assert resp.status == "success" assert resp.request_url == TARGET assert resp.terminal_url == TARGET + "/redirect" - assert resp.terminal_dt == SUCCESS_BODY['timestamp'] - assert resp.resources == SUCCESS_BODY['resources'] + assert resp.terminal_dt == SUCCESS_BODY["timestamp"] + assert resp.resources == SUCCESS_BODY["resources"] @responses.activate def test_savepagenow_remote_error(spn_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(ERROR_BODY)) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(ERROR_BODY), + ) resp = spn_client.save_url_now_v2(TARGET) assert len(responses.calls) == 3 assert resp.success is False - assert resp.status == ERROR_BODY['status_ext'] + assert resp.status == ERROR_BODY["status_ext"] assert resp.request_url == TARGET assert resp.terminal_url is None assert resp.terminal_dt is None @@ -162,17 +190,18 @@ def test_savepagenow_remote_error(spn_client): @responses.activate def test_savepagenow_500(spn_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=500, - body=json.dumps(ERROR_BODY)) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=500, + body=json.dumps(ERROR_BODY), + ) with pytest.raises(SavePageNowError): spn_client.save_url_now_v2(TARGET) @@ -183,33 +212,36 @@ def test_savepagenow_500(spn_client): @responses.activate def test_crawl_resource(spn_client, wayback_client): - responses.add(responses.POST, - 'http://dummy-spnv2/save', - status=200, - body=json.dumps({ - "url": TARGET, - "job_id": JOB_ID - })) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(PENDING_BODY)) - responses.add(responses.GET, - 'http://dummy-spnv2/save/status/' + JOB_ID, - status=200, - body=json.dumps(SUCCESS_BODY)) - responses.add(responses.GET, - 'http://dummy-cdx/cdx', - status=200, - body=json.dumps(CDX_SPN_HIT)) - responses.add(responses.GET, - 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", - TARGET + "/redirect"), - status=200, - headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, - body=WARC_BODY) - - print('https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect")) + responses.add( + responses.POST, + "http://dummy-spnv2/save", + status=200, + body=json.dumps({"url": TARGET, "job_id": JOB_ID}), + ) + responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(PENDING_BODY), + ) + 
responses.add( + responses.GET, + "http://dummy-spnv2/save/status/" + JOB_ID, + status=200, + body=json.dumps(SUCCESS_BODY), + ) + responses.add( + responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SPN_HIT) + ) + responses.add( + responses.GET, + "https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body=WARC_BODY, + ) + + print("https://web.archive.org/web/{}id_/{}".format("20180326070330", TARGET + "/redirect")) resp = spn_client.crawl_resource(TARGET, wayback_client) assert len(responses.calls) == 5 diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index 0cb59fa..da4dfd8 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -10,13 +10,30 @@ CDX_DT = "20180812220054" # cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ CDX_SINGLE_HIT = [ [ - "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect", - "robotflags", "length", "offset", "filename" + "urlkey", + "timestamp", + "original", + "mimetype", + "statuscode", + "digest", + "redirect", + "robotflags", + "length", + "offset", + "filename", ], [ - "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", - "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", - "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + "wiki,fatcat)/", + CDX_DT, + CDX_TARGET, + "text/html", + "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", + "-", + "-", + "8445", + "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz", ], ] @@ -24,47 +41,112 @@ CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR" # cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ CDX_MULTI_HIT = [ [ - "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect", - "robotflags", "length", "offset", "filename" + "urlkey", + "timestamp", + "original", + "mimetype", + "statuscode", + "digest", + "redirect", + "robotflags", + "length", + "offset", + "filename", ], [ - "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", - "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", - "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + "wiki,fatcat)/", + CDX_DT, + CDX_TARGET, + "text/html", + "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", + "-", + "-", + "8445", + "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz", ], # sooner, but not right mimetype [ - "wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", - "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", - "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + "wiki,fatcat)/", + "20180912220054", + CDX_TARGET, + "text/html", + "200", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", + "-", + "-", + "8445", + "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz", ], # sooner and mimetype, but wrong status code [ - "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", - "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", - "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + "wiki,fatcat)/", + "20180912220054", + CDX_TARGET, + "application/pdf", + "400", + "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", + "-", + "-", + "8445", + "108062304", + "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz", ], [ - "wiki,fatcat)/", "20180912220054", 
-        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
-        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+        "wiki,fatcat)/",
+        "20180912220054",
+        CDX_TARGET,
+        "application/pdf",
+        "500",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+        "-",
+        "-",
+        "8445",
+        "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
     ],
     [
-        "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150",
-        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
-        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+        "wiki,fatcat)/",
+        "20180912220054",
+        CDX_TARGET,
+        "application/pdf",
+        "150",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+        "-",
+        "-",
+        "8445",
+        "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
     ],
     # "best"
     [
-        "wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-",
-        "-", "8445", "108062304",
-        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+        "wiki,fatcat)/",
+        CDX_DT,
+        CDX_TARGET,
+        "application/pdf",
+        "200",
+        CDX_BEST_SHA1B32,
+        "-",
+        "-",
+        "8445",
+        "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
     ],
     # older
     [
-        "wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200",
-        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
-        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+        "wiki,fatcat)/",
+        "20180712220054",
+        CDX_TARGET,
+        "application/pdf",
+        "200",
+        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
+        "-",
+        "-",
+        "8445",
+        "108062304",
+        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
     ],
 ]

@@ -81,10 +163,9 @@ def cdx_client():

 @responses.activate
 def test_cdx_fetch(cdx_client):
-    responses.add(responses.GET,
-                  'http://dummy-cdx/cdx',
-                  status=200,
-                  body=json.dumps(CDX_SINGLE_HIT))
+    responses.add(
+        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+    )

     resp = cdx_client.fetch(CDX_TARGET, CDX_DT)

@@ -104,10 +185,9 @@ def test_cdx_fetch_errors(cdx_client):
     with pytest.raises(ValueError):
         resp = cdx_client.fetch(CDX_TARGET, "2019")

-    responses.add(responses.GET,
-                  'http://dummy-cdx/cdx',
-                  status=200,
-                  body=json.dumps(CDX_SINGLE_HIT))
+    responses.add(
+        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
+    )

     with pytest.raises(KeyError):
         resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -123,10 +203,9 @@ def test_cdx_fetch_errors(cdx_client):

 @responses.activate
 def test_cdx_lookup_best(cdx_client):
-    responses.add(responses.GET,
-                  'http://dummy-cdx/cdx',
-                  status=200,
-                  body=json.dumps(CDX_MULTI_HIT))
+    responses.add(
+        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+    )

     resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")

@@ -175,7 +254,7 @@ def wayback_client(cdx_client, mocker):

 @pytest.fixture
 def wayback_client_pdf(cdx_client, mocker):
-    with open('tests/files/dummy.pdf', 'rb') as f:
+    with open("tests/files/dummy.pdf", "rb") as f:
         pdf_bytes = f.read()

     client = WaybackClient(
@@ -209,10 +288,9 @@ def test_wayback_fetch(wayback_client):

 @responses.activate
 def test_lookup_resource_success(wayback_client):
-    responses.add(responses.GET,
-                  'http://dummy-cdx/cdx',
-                  status=200,
-                  body=json.dumps(CDX_MULTI_HIT))
+    responses.add(
+        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
+    )

     resp = wayback_client.lookup_resource(CDX_TARGET)

diff --git a/python/tests/test_xml.py b/python/tests/test_xml.py
index 1742f3a..786f863 100644
--- a/python/tests/test_xml.py
+++ b/python/tests/test_xml.py
@@ -5,7 +5,7 @@ from sandcrawler.xml import xml_reserialize


 def test_xml_reserialize() -> None:
-    with open('tests/files/scielo_article.jats.xml', 'rb') as f:
+    with open("tests/files/scielo_article.jats.xml", "rb") as f:
         raw_xml = f.read()

     assert b'encoding="ISO-8859-1"' in raw_xml
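
The test files reformatted above all rely on the same mocking pattern: HTTP endpoints are stubbed with the responses library, each test is wrapped in @responses.activate, intercepted requests are counted via responses.calls, and error paths are asserted with pytest.raises. The snippet below is a minimal, self-contained sketch of that pattern, not code from this repository; the fetch_status() helper and the http://dummy-api/status URL are hypothetical stand-ins.

import json

import pytest
import requests
import responses


def fetch_status(url):
    # Hypothetical helper: GET a JSON document and raise on HTTP errors.
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.json()


@responses.activate
def test_fetch_status_success():
    # Register a mocked endpoint; no real network traffic happens in the test.
    responses.add(
        responses.GET,
        "http://dummy-api/status",
        status=200,
        body=json.dumps({"status": "success"}),
    )

    result = fetch_status("http://dummy-api/status")

    # Every intercepted request is recorded on responses.calls.
    assert len(responses.calls) == 1
    assert result["status"] == "success"


@responses.activate
def test_fetch_status_500():
    responses.add(responses.GET, "http://dummy-api/status", status=500)

    with pytest.raises(requests.exceptions.HTTPError):
        fetch_status("http://dummy-api/status")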