basic DOAJ ingest request conversion script

author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:52:47 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:52:47 -0800
commit: 6a701f966b8bc760bf904c0569562b0159e13559 (patch)
tree: d440b3a9379b716de8aca3f9543249c240016ae0 /python/scripts
parent: ecd36863e607e3c9e71fd91ece44a422f88dbe2e (diff)
download: sandcrawler-6a701f966b8bc760bf904c0569562b0159e13559.tar.gz
sandcrawler-6a701f966b8bc760bf904c0569562b0159e13559.zip
1 files changed, 139 insertions, 0 deletions
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
new file mode 100755
index 0000000..f1bae8c
--- /dev/null
+++ b/python/scripts/doaj2ingestrequest.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+
+"""
+Transform an DOAJ article dump (JSON) into ingest requests.
+
+TODO: should we also attempt PDF ingest for HTML links? They seem to often be
+landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
+in the HTML headers and adds an ingest request on that basis. Or even just run
+the re-ingest in-process and publish a second result.
+"""
+
+import sys
+import json
+import argparse
+import urlcanon
+from typing import Optional, List
+
+DOMAIN_BLOCKLIST = [
+    # large OA publishers (we get via DOI)
+
+    # large repos and aggregators (we crawl directly)
+    "://arxiv.org/",
+    "://europepmc.org/",
+    "ncbi.nlm.nih.gov/",
+    #"semanticscholar.org/",
+    "://doi.org/",
+    "zenodo.org/",
+    "figshare.com/",
+    "://archive.org/",
+    ".archive.org/",
+
+    # large publishers/platforms; may remove in the future
+    #"://link.springer.com/",
+    #"://dergipark.gov.tr/",
+    #"frontiersin.org/",
+    #"scielo",
+]
+
+# these default to PDF; note that we also do pdf ingests for HTML pages
+CONTENT_TYPE_MAP = {
+    "abstract": [],
+    "doc": [],
+    "": ["pdf"],
+
+    "doi": ["pdf"],
+    "url": ["pdf"],
+    "fulltext": ["pdf"],
+    "anySimpleType": ["pdf"],
+
+    "application/pdf": ["pdf"],
+    "html": ["html", "pdf"],
+    "text/html": ["html", "pdf"],
+    "xml": ["xml"],
+}
+
+def canon(s: str) -> str:
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
+
+def transform(obj: dict) -> List[dict]:
+    """
+    Transforms from a single DOAJ object to zero or more ingest requests.
+    Returns a list of dicts.
+    """
+
+    doaj_id = obj['id'].lower()
+    assert doaj_id
+
+    bibjson = obj['bibjson']
+    if not bibjson['link']:
+        return []
+
+    requests = []
+
+    doi: Optional[str] = None
+    for ident in (bibjson['identifier'] or []):
+        if ident['type'].lower() == "doi" and ident['id'].startswith('10.'):
+            doi = ident['id'].lower()
+
+    for link in (bibjson['link'] or []):
+        if link.get('type') != "fulltext" or not link.get('url'):
+            continue
+        ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+        if not ingest_types:
+            continue
+        skip = False
+        for domain in DOMAIN_BLOCKLIST:
+            if domain in link['url'].lower():
+                skip = True
+        if skip:
+            continue
+        try:
+            base_url = canon(link['url'])
+        except UnicodeEncodeError:
+            continue
+
+        for ingest_type in ingest_types:
+            request = {
+                'base_url': base_url,
+                'ingest_type': ingest_type,
+                'link_source': 'doaj',
+                'link_source_id': doaj_id,
+                'ingest_request_source': 'doaj',
+                'release_stage': 'published',
+                'rel': 'publisher',
+                'ext_ids': {
+                    'doi': doi,
+                    'doaj': doaj_id,
+                },
+                'edit_extra': {},
+            }
+            requests.append(request)
+
+    return requests
+
+def run(args) -> None:
+    for l in args.json_file:
+        if not l.strip():
+            continue
+        row = json.loads(l)
+
+        requests = transform(row) or []
+        for r in requests:
+            print("{}".format(json.dumps(r, sort_keys=True)))
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('json_file',
+        help="DOAJ article dump file to use",
+        type=argparse.FileType('r'))
+    subparsers = parser.add_subparsers()
+
+    args = parser.parse_args()
+
+    run(args)
+
+if __name__ == '__main__':
+    main()
author	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:52:47 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-11-08 21:52:47 -0800
commit	6a701f966b8bc760bf904c0569562b0159e13559 (patch)
tree	d440b3a9379b716de8aca3f9543249c240016ae0 /python/scripts
parent	ecd36863e607e3c9e71fd91ece44a422f88dbe2e (diff)
download	sandcrawler-6a701f966b8bc760bf904c0569562b0159e13559.tar.gz sandcrawler-6a701f966b8bc760bf904c0569562b0159e13559.zip