diff options
Diffstat (limited to 'python/scripts/doaj2ingestrequest.py')
-rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 15 |
1 files changed, 8 insertions, 7 deletions
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py index 15b30a0..84a2c2c 100755 --- a/python/scripts/doaj2ingestrequest.py +++ b/python/scripts/doaj2ingestrequest.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Transform an DOAJ article dump (JSON) into ingest requests. @@ -42,22 +41,22 @@ CONTENT_TYPE_MAP = { "abstract": [], "doc": [], "": ["pdf"], - "doi": ["pdf"], "url": ["pdf"], "fulltext": ["pdf"], "anySimpleType": ["pdf"], - "application/pdf": ["pdf"], "html": ["html", "pdf"], "text/html": ["html", "pdf"], "xml": ["xml"], } + def canon(s: str) -> str: parsed = urlcanon.parse_url(s) return str(urlcanon.whatwg(parsed)) + def transform(obj: dict) -> List[dict]: """ Transforms from a single DOAJ object to zero or more ingest requests. @@ -118,6 +117,7 @@ def transform(obj: dict) -> List[dict]: return requests + def run(args) -> None: for l in args.json_file: if not l.strip(): @@ -128,17 +128,18 @@ def run(args) -> None: for r in requests: print("{}".format(json.dumps(r, sort_keys=True))) + def main() -> None: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('json_file', - help="DOAJ article dump file to use", - type=argparse.FileType('r')) + help="DOAJ article dump file to use", + type=argparse.FileType('r')) subparsers = parser.add_subparsers() args = parser.parse_args() run(args) + if __name__ == '__main__': main() |