#!/usr/bin/env python3

"""
Transform a DOAJ article dump (JSON) into ingest requests.

TODO: should we also attempt PDF ingest for HTML links? They often seem to be
landing pages. Or we could have some pipeline that notices, e.g.,
`citation_pdf_url` in the HTML headers and adds an ingest request on that
basis. Or even just run the re-ingest in-process and publish a second result.
"""

import argparse
import json
from typing import List, Optional

import urlcanon

DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)
    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    # "semanticscholar.org/",
    "://doi.org/",
    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
    # large publishers/platforms; may remove in the future
    # "://link.springer.com/",
    # "://dergipark.gov.tr/",
    # "frontiersin.org/",
    # "scielo",
]

# Maps DOAJ `content_type` values to ingest types. Most default to PDF; note
# that we also do PDF ingests for HTML pages.
CONTENT_TYPE_MAP = {
    "abstract": [],
    "doc": [],
    "": ["pdf"],
    "doi": ["pdf"],
    "url": ["pdf"],
    "fulltext": ["pdf"],
    "anySimpleType": ["pdf"],
    "application/pdf": ["pdf"],
    "html": ["html", "pdf"],
    "text/html": ["html", "pdf"],
    "xml": ["xml"],
}


def canon(s: str) -> str:
    """Canonicalize a URL using WHATWG rules (via urlcanon)."""
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))


def transform(obj: dict) -> List[dict]:
    """
    Transforms a single DOAJ object into zero or more ingest requests.

    Returns a list of dicts.
    """
    doaj_id = obj["id"].lower()
    assert doaj_id

    bibjson = obj["bibjson"]
    if not bibjson["link"]:
        return []

    requests = []

    # prefer a DOI from the identifier list, if one is present, for ext_ids
    doi: Optional[str] = None
    for ident in bibjson["identifier"] or []:
        if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
            doi = ident["id"].lower()

    for link in bibjson["link"] or []:
        # only fulltext links with an actual URL are candidates
        if link.get("type") != "fulltext" or not link.get("url"):
            continue
        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
        if not ingest_types:
            continue

        # skip domains we either crawl directly or get via DOI
        if any(domain in link["url"].lower() for domain in DOMAIN_BLOCKLIST):
            continue

        try:
            base_url = canon(link["url"].strip())
        except UnicodeEncodeError:
            continue

        if not base_url or len(base_url) > 1000:
            continue

        for ingest_type in ingest_types:
            request = {
                "base_url": base_url,
                "ingest_type": ingest_type,
                "link_source": "doaj",
                "link_source_id": doaj_id,
                "ingest_request_source": "doaj",
                "release_stage": "published",
                "rel": "publisher",
                "ext_ids": {
                    "doi": doi,
                    "doaj": doaj_id,
                },
                "edit_extra": {},
            }
            requests.append(request)

    return requests


def run(args) -> None:
    # read JSON lines, transform each record, print ingest requests as JSON lines
    for line in args.json_file:
        if not line.strip():
            continue
        row = json.loads(line)
        requests = transform(row) or []
        for r in requests:
            print(json.dumps(r, sort_keys=True))


def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
    )
    args = parser.parse_args()

    run(args)


if __name__ == "__main__":
    main()
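
# Example usage (a rough sketch; the script and input file names below are
# hypothetical, not a documented interface). The script reads one DOAJ article
# record per line and emits one JSON ingest request per line:
#
#     ./transform_doaj2ingestrequest.py doaj_article_dump.json > ingest_requests.json
#
# Each output line looks roughly like (values illustrative):
#
#     {"base_url": "https://example.com/article.pdf", "ingest_type": "pdf",
#      "link_source": "doaj", "link_source_id": "<doaj-article-id>", ...}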