python/scripts/doaj2ingestrequest.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

#!/usr/bin/env python3
"""
Transform a DOAJ article dump (JSON) into ingest requests.

TODO: should we also attempt PDF ingest for HTML links? They seem to often be
landing pages. Or could have some pipeline that notices, eg, `citation_pdf_url`
in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""

import argparse
import json
import sys
from typing import List, Optional

import urlcanon

# URL fragments that cause a DOAJ link to be skipped entirely.
#
# Matching is a plain substring test against the lowercased link URL (see
# transform()), not a parsed-hostname comparison. Entries starting with "://"
# only match when the text follows the scheme separator directly; entries
# without it (eg, "zenodo.org/") match anywhere in the URL, which includes
# subdomains.
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)
    # (no entries are currently listed under this heading)
    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
    # "semanticscholar.org/",
    "://doi.org/",
    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
    # large publishers/platforms; may remove in the future
    # (all entries below are currently disabled)
    # "://link.springer.com/",
    # "://dergipark.gov.tr/",
    # "frontiersin.org/",
    # "scielo",
]

# Maps the lowercased `content_type` of a DOAJ link to the list of ingest
# types to request for it. A missing/null content type is looked up as "".
#
# these default to PDF; note that we also do pdf ingests for HTML pages
#
# An empty list means "known content type, but generate no requests"; content
# types not present in this map are skipped the same way by transform().
CONTENT_TYPE_MAP = {
    "abstract": [],
    "doc": [],
    "": ["pdf"],
    "doi": ["pdf"],
    "url": ["pdf"],
    "fulltext": ["pdf"],
    "anySimpleType": ["pdf"],
    "application/pdf": ["pdf"],
    "html": ["html", "pdf"],
    "text/html": ["html", "pdf"],
    "xml": ["xml"],
}


def canon(s: str) -> str:
    """
    Parses the URL string with urlcanon, applies the `whatwg` canonicalizer,
    and returns the result as a string.
    """
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))


def transform(obj: dict) -> List[dict]:
    """
    Transforms from a single DOAJ object to zero or more ingest requests.
    Returns a list of dicts.

    Only links with type "fulltext" and a non-empty URL are considered. A link
    is skipped when its content type maps to no ingest types (or is unknown),
    when its URL contains a DOMAIN_BLOCKLIST entry, when canonicalization
    raises UnicodeEncodeError, or when the canonical URL is empty or longer
    than 1000 characters. One request is emitted per (link, ingest type) pair.

    The "link" and "identifier" keys of `bibjson` are optional: missing or
    null values are treated as empty lists.

    Raises KeyError if `obj` has no "id" or "bibjson" key, and AssertionError
    if the "id" is empty.
    """

    doaj_id = obj["id"].lower()
    assert doaj_id

    bibjson = obj["bibjson"]
    links = bibjson.get("link") or []
    if not links:
        return []

    # If several DOI identifiers are present, the last valid one wins.
    doi: Optional[str] = None
    for ident in bibjson.get("identifier") or []:
        ident_id = ident.get("id")
        # identifier records without a "type" (or with a null one) are ignored
        ident_type = (ident.get("type") or "").lower()
        if ident_type == "doi" and ident_id and ident_id.startswith("10."):
            doi = ident_id.lower()

    requests: List[dict] = []
    for link in links:
        url = link.get("url")
        if link.get("type") != "fulltext" or not url:
            continue

        # both unknown content types (None) and explicitly empty lists skip
        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
        if not ingest_types:
            continue

        # substring match against the lowercased, not-yet-canonicalized URL
        url_lower = url.lower()
        if any(domain in url_lower for domain in DOMAIN_BLOCKLIST):
            continue

        try:
            base_url = canon(url.strip())
        except UnicodeEncodeError:
            continue

        if not base_url or len(base_url) > 1000:
            continue

        for ingest_type in ingest_types:
            requests.append(
                {
                    "base_url": base_url,
                    "ingest_type": ingest_type,
                    "link_source": "doaj",
                    "link_source_id": doaj_id,
                    "ingest_request_source": "doaj",
                    "release_stage": "published",
                    "rel": "publisher",
                    "ext_ids": {
                        "doi": doi,
                        "doaj": doaj_id,
                    },
                    "edit_extra": {},
                }
            )

    return requests


def run(args) -> None:
    """
    Reads `args.json_file` line by line, parses each non-blank line as a JSON
    DOAJ record, and prints every resulting ingest request to stdout as one
    JSON object per line (keys sorted).
    """
    for line in args.json_file:
        if line.strip():
            record = json.loads(line)
            for request in transform(record) or []:
                print(json.dumps(request, sort_keys=True))


def main() -> None:
    """
    Command-line entry point: parses the single positional `json_file`
    argument (opened for reading by argparse) and passes the parsed
    arguments to run().
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
    )

    # No sub-commands are defined for this script, so no subparser group is
    # registered; an empty one only adds a bogus "{} ..." to the usage line.
    args = parser.parse_args()

    run(args)


# Only run the CLI when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()