diff options
Diffstat (limited to 'python/scripts/doaj2ingestrequest.py')
-rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 87 |
1 files changed, 46 insertions, 41 deletions
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index b981ab6..aef5c12 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an DOAJ article dump (JSON) into ingest requests. @@ -9,31 +8,31 @@ in the HTML headers and adds an ingest request on that basis. Or even just run the re-ingest in-process and publish a second result. """ -import sys -import json import argparse +import json +import sys +from typing import List, Optional + import urlcanon -from typing import Optional, List DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) - # large repos and aggregators (we crawl directly) "://arxiv.org/", "://europepmc.org/", "ncbi.nlm.nih.gov/", - #"semanticscholar.org/", + # "semanticscholar.org/", "://doi.org/", + "://dx.doi.org/", "zenodo.org/", "figshare.com/", "://archive.org/", ".archive.org/", - # large publishers/platforms; may remove in the future - #"://link.springer.com/", - #"://dergipark.gov.tr/", - #"frontiersin.org/", - #"scielo", + # "://link.springer.com/", + # "://dergipark.gov.tr/", + # "frontiersin.org/", + # "scielo", ] # these default to PDF; note that we also do pdf ingests for HTML pages @@ -41,78 +40,83 @@ CONTENT_TYPE_MAP = { "abstract": [], "doc": [], "": ["pdf"], - "doi": ["pdf"], "url": ["pdf"], "fulltext": ["pdf"], "anySimpleType": ["pdf"], - "application/pdf": ["pdf"], "html": ["html", "pdf"], "text/html": ["html", "pdf"], "xml": ["xml"], } + def canon(s: str) -> str: parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj: dict) -> List[dict]: """ Transforms from a single DOAJ object to zero or more ingest requests. Returns a list of dicts. """ - doaj_id = obj['id'].lower() + doaj_id = obj["id"].lower() assert doaj_id - bibjson = obj['bibjson'] - if not bibjson['link']: + bibjson = obj["bibjson"] + if not bibjson["link"]: return [] requests = [] doi: Optional[str] = None - for ident in (bibjson['identifier'] or []): - if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'): - doi = ident['id'].lower() + for ident in bibjson["identifier"] or []: + if ident["type"].lower() == "doi" and ident.get("id") and ident["id"].startswith("10."): + doi = ident["id"].lower() - for link in (bibjson['link'] or []): - if link.get('type') != "fulltext" or not link.get('url'): + for link in bibjson["link"] or []: + if link.get("type") != "fulltext" or not link.get("url"): continue - ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower()) + ingest_types = CONTENT_TYPE_MAP.get((link.get("content_type") or "").lower()) if not ingest_types: continue + skip = False for domain in DOMAIN_BLOCKLIST: - if domain in link['url'].lower(): + if domain in link["url"].lower(): skip = True if skip: continue try: - base_url = canon(link['url']) + base_url = canon(link["url"].strip()) except UnicodeEncodeError: continue + if not base_url or len(base_url) > 1000: + continue + for ingest_type in ingest_types: request = { - 'base_url': base_url, - 'ingest_type': ingest_type, - 'link_source': 'doaj', - 'link_source_id': doaj_id, - 'ingest_request_source': 'doaj', - 'release_stage': 'published', - 'rel': 'publisher', - 'ext_ids': { - 'doi': doi, - 'doaj': doaj_id, + "base_url": base_url, + "ingest_type": ingest_type, + "link_source": "doaj", + "link_source_id": doaj_id, + "ingest_request_source": "doaj", + "release_stage": "published", + "rel": "publisher", + "ext_ids": { + "doi": doi, + "doaj": doaj_id, }, - 'edit_extra': {}, + "edit_extra": {}, } requests.append(request) return requests + def run(args) -> None: for l in args.json_file: if not l.strip(): @@ -123,17 +127,18 @@ def run(args) -> None: for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main() -> None: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('json_file', - help="DOAJ article dump file to use", - type=argparse.FileType('r')) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "json_file", help="DOAJ article dump file to use", type=argparse.FileType("r") + ) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) -if __name__ == '__main__': + +if __name__ == "__main__": main() |