diff options
Diffstat (limited to 'python/scripts/doaj2ingestrequest.py')
-rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 144 |
1 file changed, 144 insertions, 0 deletions
#!/usr/bin/env python3
"""
Transform a DOAJ article dump (JSON) into ingest requests.

TODO: should we also attempt PDF ingest for HTML links? They seem to often be
landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""

import argparse
import json
from typing import List, Optional

# Hosts whose fulltext links we skip: large publishers we reach via DOI, and
# large repos/aggregators we crawl directly. Matched as substrings of the
# lower-cased URL, so entries like "zenodo.org/" also catch subdomains.
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)
    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    # "semanticscholar.org/",
    "://doi.org/",
    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
    # large publishers/platforms; may remove in the future
    # "://link.springer.com/",
    # "://dergipark.gov.tr/",
    # "frontiersin.org/",
    # "scielo",
]

# Maps a DOAJ link `content_type` (lower-cased) to the list of ingest types
# to request for that link. These default to PDF; note that we also do pdf
# ingests for HTML pages. An empty list means "skip this link entirely".
CONTENT_TYPE_MAP = {
    "abstract": [],
    "doc": [],
    "": ["pdf"],
    "doi": ["pdf"],
    "url": ["pdf"],
    "fulltext": ["pdf"],
    "anySimpleType": ["pdf"],
    "application/pdf": ["pdf"],
    "html": ["html", "pdf"],
    "text/html": ["html", "pdf"],
    "xml": ["xml"],
}


def canon(s: str) -> str:
    """Return the WHATWG-canonicalized form of the URL string `s`."""
    # Imported lazily so the pure transform logic can be imported and tested
    # without the third-party `urlcanon` package installed; when the script
    # is actually run, the import happens on first URL.
    import urlcanon

    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))


def transform(obj: dict) -> List[dict]:
    """
    Transform a single DOAJ article object into zero or more ingest requests.

    Expects `obj` to have an `id` and a `bibjson` mapping (DOAJ article dump
    schema). Returns a list of ingest request dicts; empty when the record
    has no usable fulltext links.

    Raises ValueError if the record's `id` is empty.
    """

    doaj_id = obj["id"].lower()
    if not doaj_id:
        # Explicit raise rather than `assert`: asserts are stripped under -O.
        raise ValueError("DOAJ record has an empty 'id'")

    bibjson = obj["bibjson"]
    if not bibjson.get("link"):
        return []

    requests = []

    # Pick up a DOI from the identifier list, if present. No early break:
    # when several valid DOIs appear, the last one wins.
    doi: Optional[str] = None
    for ident in bibjson.get("identifier") or []:
        if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
            doi = ident["id"].lower()

    for link in bibjson["link"] or []:
        # Only explicit fulltext links with a URL are candidates.
        if link.get("type") != "fulltext" or not link.get("url"):
            continue
        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
        if not ingest_types:
            continue

        # Skip links into hosts we crawl some other way (see DOMAIN_BLOCKLIST).
        url_lower = link["url"].lower()
        if any(domain in url_lower for domain in DOMAIN_BLOCKLIST):
            continue

        try:
            base_url = canon(link["url"].strip())
        except UnicodeEncodeError:
            # Some DOAJ URLs are malformed/un-encodable; best-effort skip.
            continue

        # Sanity bound: drop empty or absurdly long canonicalized URLs.
        if not base_url or len(base_url) > 1000:
            continue

        for ingest_type in ingest_types:
            requests.append(
                {
                    "base_url": base_url,
                    "ingest_type": ingest_type,
                    "link_source": "doaj",
                    "link_source_id": doaj_id,
                    "ingest_request_source": "doaj",
                    "release_stage": "published",
                    "rel": "publisher",
                    "ext_ids": {
                        "doi": doi,
                        "doaj": doaj_id,
                    },
                    "edit_extra": {},
                }
            )

    return requests


def run(args) -> None:
    """Read newline-delimited JSON records from `args.json_file` and print one
    ingest request JSON object per line to stdout (keys sorted for stable
    output)."""
    for line in args.json_file:
        if not line.strip():
            continue
        row = json.loads(line)
        for request in transform(row):
            print(json.dumps(request, sort_keys=True))


def main() -> None:
    """Command-line entry point: parse arguments and run the transform."""
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
    )
    args = parser.parse_args()
    run(args)


if __name__ == "__main__":
    main()