diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:17:35 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:17:35 +0200 |
commit | 2f948cfbb484241178fa7e8c7abd8b0c40a9db24 (patch) | |
tree | 5184838a7daba6e54333dacde4d3afeac0384b28 | |
parent | 15b4c403141e9477fb9c6c89f6ad4d27c6207a43 (diff) | |
download | fuzzycat-2f948cfbb484241178fa7e8c7abd8b0c40a9db24.tar.gz fuzzycat-2f948cfbb484241178fa7e8c7abd8b0c40a9db24.zip |
cleanup handling: add parameter
allow string cleanup to be called directly
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | fuzzycat/__init__.py | 1 | ||||
-rw-r--r-- | fuzzycat/issn.py | 40 | ||||
-rw-r--r-- | fuzzycat/utils.py | 3 |
4 files changed, 26 insertions, 19 deletions
@@ -130,3 +130,4 @@ dmypy.json # Data dir /data +*file.db diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py index 58ccd5f..7feffd5 100644 --- a/fuzzycat/__init__.py +++ b/fuzzycat/__init__.py @@ -3,3 +3,4 @@ __version__ = "0.1.1" from fuzzycat.matching import match_container_names from fuzzycat.status import MatchStatus from fuzzycat.utils import * +from fuzzycat.journals import JournalLookup diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py index 28d2812..46786c9 100644 --- a/fuzzycat/issn.py +++ b/fuzzycat/issn.py @@ -167,9 +167,11 @@ import shelve import sys from typing import Dict, Iterable, List, Union +from fuzzycat import cleanups from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace) + def listify(v: Union[str, List[str]]) -> List[str]: """ Sensible create a list. @@ -272,16 +274,7 @@ def de_jsonld(lines: Iterable): print(json.dumps(doc, cls=SetEncoder)) -# These transformations should not affect the name or a journal. -cleanup_pipeline = StringPipeline([ - str.lower, - normalize_whitespace, - normalize_ampersand, - lambda v: v.rstrip("."), -]) - - -def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline): +def generate_name_pairs(lines: Iterable, cleanup_pipeline=None): """ Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on errors. Proto unit test data. @@ -314,29 +307,30 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline): print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr) continue for a, b in itertools.combinations(doc.get("names", []), 2): - a = cleanup_pipeline.run(a) - b = cleanup_pipeline.run(b) + if cleanup_pipeline: + a = cleanup_pipeline(a) + b = cleanup_pipeline(b) yield (doc["issnl"], a, b) -def generate_name_issn_mapping(lines: Iterable): +def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None): """ Given JSON lines, generate a dictionary mapping names sets of ISSN. 
Names might be reused. """ mapping = collections.defaultdict(set) - for issnl, a, b in generate_name_pairs(lines): + for issnl, a, b in generate_name_pairs(lines, cleanup_pipeline=cleanup_pipeline): mapping[a].add(issnl) mapping[b].add(issnl) return mapping -def generate_shelve(lines: Iterable, output: str): +def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None): """ Generate a persistent key value store from name issn mappings. """ with shelve.open(output) as db: - for name, issnls in generate_name_issn_mapping(lines).items(): + for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items(): db[name] = issnls print("wrote {} keys to {}".format(len(db), output), file=sys.stderr) @@ -361,16 +355,24 @@ def main(): type=str, default="output.file", help="write output to file") + parser.add_argument("-c", + "--cleanup", + type=str, + default=None, + help="cleanup pipeline name") parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld") args = parser.parse_args() + # Map more cleanup routines. 
+ cleanup = dict(basic=cleanups.basic).get(args.cleanup) + if args.make_mapping: - print(json.dumps(generate_name_issn_mapping(args.file), cls=SetEncoder)) + print(json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup), cls=SetEncoder)) if args.make_pairs: - for issn, a, b in generate_name_pairs(args.file): + for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup): print("{}\t{}\t{}".format(issn, a, b)) if args.de_jsonld: de_jsonld(args.file) if args.make_shelve: - generate_shelve(args.file, output=args.output) + generate_shelve(args.file, output=args.output, cleanup_pipeline=cleanup) diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 97125ce..ab693eb 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -44,6 +44,9 @@ class StringPipeline: def __init__(self, fs: List[Callable[[str], str]]): self.fs = fs + def __call__(self, s: str) -> str: + return self.run(s) + def run(self, s: str) -> str: """ Apply all function and return result. |