diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:17:35 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-08-15 18:17:35 +0200 | 
| commit | 2f948cfbb484241178fa7e8c7abd8b0c40a9db24 (patch) | |
| tree | 5184838a7daba6e54333dacde4d3afeac0384b28 | |
| parent | 15b4c403141e9477fb9c6c89f6ad4d27c6207a43 (diff) | |
| download | fuzzycat-2f948cfbb484241178fa7e8c7abd8b0c40a9db24.tar.gz fuzzycat-2f948cfbb484241178fa7e8c7abd8b0c40a9db24.zip | |
cleanup handling: add parameter
allow string cleanup to be called directly
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | fuzzycat/__init__.py | 1 | ||||
| -rw-r--r-- | fuzzycat/issn.py | 40 | ||||
| -rw-r--r-- | fuzzycat/utils.py | 3 | 
4 files changed, 26 insertions, 19 deletions
| @@ -130,3 +130,4 @@ dmypy.json  # Data dir  /data +*file.db diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py index 58ccd5f..7feffd5 100644 --- a/fuzzycat/__init__.py +++ b/fuzzycat/__init__.py @@ -3,3 +3,4 @@ __version__ = "0.1.1"  from fuzzycat.matching import match_container_names  from fuzzycat.status import MatchStatus  from fuzzycat.utils import * +from fuzzycat.journals import JournalLookup diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py index 28d2812..46786c9 100644 --- a/fuzzycat/issn.py +++ b/fuzzycat/issn.py @@ -167,9 +167,11 @@ import shelve  import sys  from typing import Dict, Iterable, List, Union +from fuzzycat import cleanups  from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace) +  def listify(v: Union[str, List[str]]) -> List[str]:      """      Sensible create a list. @@ -272,16 +274,7 @@ def de_jsonld(lines: Iterable):              print(json.dumps(doc, cls=SetEncoder)) -# These transformations should not affect the name or a journal. -cleanup_pipeline = StringPipeline([ -    str.lower, -    normalize_whitespace, -    normalize_ampersand, -    lambda v: v.rstrip("."), -]) - - -def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline): +def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):      """      Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on      errors. Proto unit test data. 
@@ -314,29 +307,30 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline):              print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)              continue          for a, b in itertools.combinations(doc.get("names", []), 2): -            a = cleanup_pipeline.run(a) -            b = cleanup_pipeline.run(b) +            if cleanup_pipeline: +                a = cleanup_pipeline(a) +                b = cleanup_pipeline(b)              yield (doc["issnl"], a, b) -def generate_name_issn_mapping(lines: Iterable): +def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):      """      Given JSON lines, generate a dictionary mapping names sets of ISSN. Names      might be reused.      """      mapping = collections.defaultdict(set) -    for issnl, a, b in generate_name_pairs(lines): +    for issnl, a, b in generate_name_pairs(lines, cleanup_pipeline=cleanup_pipeline):          mapping[a].add(issnl)          mapping[b].add(issnl)      return mapping -def generate_shelve(lines: Iterable, output: str): +def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):      """      Generate a persistent key value store from name issn mappings.      
"""      with shelve.open(output) as db: -        for name, issnls in generate_name_issn_mapping(lines).items(): +        for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():              db[name] = issnls          print("wrote {} keys to {}".format(len(db), output), file=sys.stderr) @@ -361,16 +355,24 @@ def main():                          type=str,                          default="output.file",                          help="write output to file") +    parser.add_argument("-c", +                        "--cleanup", +                        type=str, +                        default=None, +                        help="cleanup pipeline name")      parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")      args = parser.parse_args() +    # Map more cleanup routines. +    cleanup = dict(basic=cleanups.basic).get(args.cleanup) +      if args.make_mapping: -        print(json.dumps(generate_name_issn_mapping(args.file), cls=SetEncoder)) +        print(json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup), cls=SetEncoder))      if args.make_pairs: -        for issn, a, b in generate_name_pairs(args.file): +        for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):              print("{}\t{}\t{}".format(issn, a, b))      if args.de_jsonld:          de_jsonld(args.file)      if args.make_shelve: -        generate_shelve(args.file, output=args.output) +        generate_shelve(args.file, output=args.output, cleanup_pipeline=cleanup) diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 97125ce..ab693eb 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -44,6 +44,9 @@ class StringPipeline:      def __init__(self, fs: List[Callable[[str], str]]):          self.fs = fs +    def __call__(self, s: str) -> str: +        return self.run(s) +      def run(self, s: str) -> str:          """          Apply all function and return result. | 
