Diffstat (limited to 'python/scripts/doaj2ingestrequest.py')
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py  87
1 file changed, 46 insertions(+), 41 deletions(-)
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index b981ab6..aef5c12 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Transform a DOAJ article dump (JSON) into ingest requests.
@@ -9,31 +8,31 @@ in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
-import sys
-import json
import argparse
+import json
+import sys
+from typing import List, Optional
+
import urlcanon
-from typing import Optional, List
DOMAIN_BLOCKLIST = [
    # large OA publishers (we get via DOI)
-
    # large repos and aggregators (we crawl directly)
    "://arxiv.org/",
    "://europepmc.org/",
    "ncbi.nlm.nih.gov/",
-    #"semanticscholar.org/",
+    # "semanticscholar.org/",
    "://doi.org/",
+    "://dx.doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://archive.org/",
    ".archive.org/",
-
    # large publishers/platforms; may remove in the future
-    #"://link.springer.com/",
-    #"://dergipark.gov.tr/",
-    #"frontiersin.org/",
-    #"scielo",
+    # "://link.springer.com/",
+    # "://dergipark.gov.tr/",
+    # "frontiersin.org/",
+    # "scielo",
]
# these default to PDF; note that we also do pdf ingests for HTML pages
@@ -41,78 +40,83 @@ CONTENT_TYPE_MAP = {
"abstract": [],
"doc": [],
"": ["pdf"],
-
"doi": ["pdf"],
"url": ["pdf"],
"fulltext": ["pdf"],
"anySimpleType": ["pdf"],
-
"application/pdf": ["pdf"],
"html": ["html", "pdf"],
"text/html": ["html", "pdf"],
"xml": ["xml"],
}
+
def canon(s: str) -> str:
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))
+
def transform(obj: dict) -> List[dict]:
"""
Transforms from a single DOAJ object to zero or more ingest requests.
Returns a list of dicts.
"""
-    doaj_id = obj['id'].lower()
+    doaj_id = obj["id"].lower()
    assert doaj_id
-    bibjson = obj['bibjson']
-    if not bibjson['link']:
+    bibjson = obj["bibjson"]
+    if not bibjson["link"]:
        return []
    requests = []
    doi: Optional[str] = None
-    for ident in (bibjson['identifier'] or []):
-        if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
-            doi = ident['id'].lower()
+    for ident in bibjson["identifier"] or []:
+        if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."):
+            doi = ident["id"].lower()
-    for link in (bibjson['link'] or []):
-        if link.get('type') != "fulltext" or not link.get('url'):
+    for link in bibjson["link"] or []:
+        if link.get("type") != "fulltext" or not link.get("url"):
            continue
-        ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
+        ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower())
        if not ingest_types:
            continue
+
        skip = False
        for domain in DOMAIN_BLOCKLIST:
-            if domain in link['url'].lower():
+            if domain in link["url"].lower():
                skip = True
        if skip:
            continue
        try:
-            base_url = canon(link['url'])
+            base_url = canon(link["url"].strip())
        except UnicodeEncodeError:
            continue
+        if not base_url or len(base_url) > 1000:
+            continue
+
        for ingest_type in ingest_types:
            request = {
-                'base_url': base_url,
-                'ingest_type': ingest_type,
-                'link_source': 'doaj',
-                'link_source_id': doaj_id,
-                'ingest_request_source': 'doaj',
-                'release_stage': 'published',
-                'rel': 'publisher',
-                'ext_ids': {
-                    'doi': doi,
-                    'doaj': doaj_id,
+                "base_url": base_url,
+                "ingest_type": ingest_type,
+                "link_source": "doaj",
+                "link_source_id": doaj_id,
+                "ingest_request_source": "doaj",
+                "release_stage": "published",
+                "rel": "publisher",
+                "ext_ids": {
+                    "doi": doi,
+                    "doaj": doaj_id,
                },
-                'edit_extra': {},
+                "edit_extra": {},
            }
            requests.append(request)
    return requests
+
def run(args) -> None:
    for l in args.json_file:
        if not l.strip():
@@ -123,17 +127,18 @@ def run(args) -> None:
        for r in requests:
            print("{}".format(json.dumps(r, sort_keys=True)))
+
def main() -> None:
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('json_file',
-        help="DOAJ article dump file to use",
-        type=argparse.FileType('r'))
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r")
+    )
    subparsers = parser.add_subparsers()
    args = parser.parse_args()
    run(args)
-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()
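
For reference, a minimal sketch of what this transform does end to end, using a
hypothetical DOAJ record (field names follow what transform() reads above; the
id, DOI, and URL values are invented for illustration):

# Hypothetical input record, one JSON object per line of the DOAJ dump:
import json

doaj_record = {
    "id": "00003741594643f4996e2555a01e03c7",
    "bibjson": {
        "identifier": [{"type": "DOI", "id": "10.1234/example-article"}],
        "link": [
            {
                "type": "fulltext",
                "url": "https://journal.example.com/article/download/123/456",
                "content_type": "application/pdf",
            }
        ],
    },
}

# The single ingest request transform() would emit for this record:
# "application/pdf" maps to ["pdf"] in CONTENT_TYPE_MAP, the URL passes the
# domain blocklist, and the DOI is lowercased into ext_ids.
expected_request = {
    "base_url": "https://journal.example.com/article/download/123/456",
    "ingest_type": "pdf",
    "link_source": "doaj",
    "link_source_id": "00003741594643f4996e2555a01e03c7",
    "ingest_request_source": "doaj",
    "release_stage": "published",
    "rel": "publisher",
    "ext_ids": {
        "doi": "10.1234/example-article",
        "doaj": "00003741594643f4996e2555a01e03c7",
    },
    "edit_extra": {},
}

print(json.dumps(expected_request, sort_keys=True))

Invocation takes a single positional argument, e.g.
./python/scripts/doaj2ingestrequest.py doaj_article_data.json, and prints one
request JSON object per line to stdout.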