Diffstat (limited to 'python/scripts/doaj2ingestrequest.py')
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py  144
1 file changed, 144 insertions(+), 0 deletions(-)
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
new file mode 100755
index 0000000..aef5c12
--- /dev/null
+++ b/python/scripts/doaj2ingestrequest.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Transform a DOAJ article dump (JSON) into ingest requests.
+
+TODO: should we also attempt PDF ingest for HTML links? They often seem to be
+landing pages. Or we could have some pipeline that notices, eg, `citation_pdf_url`
+in the HTML headers and adds an ingest request on that basis. Or even just run
+the re-ingest in-process and publish a second result.
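+
+Example invocation (a sketch; the dump filename is hypothetical):
+
+    ./doaj2ingestrequest.py doaj_article_dump.json > ingest_requests.json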
+"""
+
+import argparse
+import json
+import sys
+from typing import List, Optional
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+ # large OA publishers (we get via DOI)
+ # large repos and aggregators (we crawl directly)
+ "://arxiv.org/",
+ "://europepmc.org/",
+ "ncbi.nlm.nih.gov/",
+ # "semanticscholar.org/",
+ "://doi.org/",
+ "://dx.doi.org/",
+ "zenodo.org/",
+ "figshare.com/",
+ "://archive.org/",
+ ".archive.org/",
+ # large publishers/platforms; may remove in the future
+ # "://link.springer.com/",
+ # "://dergipark.gov.tr/",
+ # "frontiersin.org/",
+ # "scielo",
+]
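+
+# NOTE: entries are matched as plain substrings of the lower-cased URL, so
+# "://arxiv.org/" blocks both http and https links, and ".archive.org/"
+# blocks any subdomain (eg, web.archive.org)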
+
+# map DOAJ link content_type values to ingest types; most default to PDF,
+# and note that we also do PDF ingests for HTML pages
+CONTENT_TYPE_MAP = {
+ "abstract": [],
+ "doc": [],
+ "": ["pdf"],
+ "doi": ["pdf"],
+ "url": ["pdf"],
+ "fulltext": ["pdf"],
+ "anySimpleType": ["pdf"],
+ "application/pdf": ["pdf"],
+ "html": ["html", "pdf"],
+ "text/html": ["html", "pdf"],
+ "xml": ["xml"],
+}
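+
+# For example, a "text/html" link yields two ingest requests (one "html" and
+# one "pdf"), while "abstract" and "doc" links are dropped entirely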
+
+
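+# eg (a sketch; exact output depends on urlcanon's WHATWG rules):
+#   canon("HTTP://Example.COM/a") -> "http://example.com/a"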
+def canon(s: str) -> str:
+    """Canonicalizes a URL string using WHATWG rules (via the urlcanon library)."""
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj: dict) -> List[dict]:
+ """
+ Transforms from a single DOAJ object to zero or more ingest requests.
+ Returns a list of dicts.
+ """
+
+ doaj_id = obj["id"].lower()
+ assert doaj_id
+
+ bibjson = obj["bibjson"]
+ if not bibjson["link"]:
+ return []
+
+ requests = []
+
+ doi: Optional[str] = None
+ for ident in bibjson["identifier"] or []:
+ if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+ doi = ident["id"].lower()
+
+ for link in bibjson["link"] or []:
+ if link.get("type") != "fulltext" or not link.get("url"):
+ continue
+ ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
+ if not ingest_types:
+ continue
+
+        if any(domain in link["url"].lower() for domain in DOMAIN_BLOCKLIST):
+            continue
+ try:
+ base_url = canon(link["url"].strip())
+ except UnicodeEncodeError:
+ continue
+
+ if not base_url or len(base_url) > 1000:
+ continue
+
+ for ingest_type in ingest_types:
+ request = {
+ "base_url": base_url,
+ "ingest_type": ingest_type,
+ "link_source": "doaj",
+ "link_source_id": doaj_id,
+ "ingest_request_source": "doaj",
+ "release_stage": "published",
+ "rel": "publisher",
+ "ext_ids": {
+ "doi": doi,
+ "doaj": doaj_id,
+ },
+ "edit_extra": {},
+ }
+ requests.append(request)
+
+ return requests
+
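+# A minimal input record (hypothetical values) and the request it produces:
+#
+#   transform({"id": "ABC123", "bibjson": {
+#       "identifier": [{"type": "doi", "id": "10.123/abc"}],
+#       "link": [{"type": "fulltext", "url": "http://example.com/a.pdf",
+#                 "content_type": "application/pdf"}]}})
+#   -> one request with ingest_type "pdf", base_url set to the canonicalized
+#      URL, and ext_ids {"doi": "10.123/abc", "doaj": "abc123"}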
+
+def run(args) -> None:
+    for line in args.json_file:
+        if not line.strip():
+            continue
+        row = json.loads(line)
+        requests = transform(row)
+        for r in requests:
+            print(json.dumps(r, sort_keys=True))
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+ )
+
+ args = parser.parse_args()
+
+ run(args)
+
+
+if __name__ == "__main__":
+ main()