about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-08-15 18:17:35 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-15 18:17:35 +0200
commit: 2f948cfbb484241178fa7e8c7abd8b0c40a9db24 (patch)
tree: 5184838a7daba6e54333dacde4d3afeac0384b28
parent: 15b4c403141e9477fb9c6c89f6ad4d27c6207a43 (diff)
download: fuzzycat-2f948cfbb484241178fa7e8c7abd8b0c40a9db24.tar.gz
          fuzzycat-2f948cfbb484241178fa7e8c7abd8b0c40a9db24.zip
cleanup handling: add parameter
allow string cleanup to be called directly
-rw-r--r--  .gitignore            1
-rw-r--r--  fuzzycat/__init__.py  1
-rw-r--r--  fuzzycat/issn.py      40
-rw-r--r--  fuzzycat/utils.py     3
4 files changed, 26 insertions, 19 deletions
diff --git a/.gitignore b/.gitignore
index f26a420..36fc0b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,4 @@ dmypy.json
# Data dir
/data
+*file.db
diff --git a/fuzzycat/__init__.py b/fuzzycat/__init__.py
index 58ccd5f..7feffd5 100644
--- a/fuzzycat/__init__.py
+++ b/fuzzycat/__init__.py
@@ -3,3 +3,4 @@ __version__ = "0.1.1"
from fuzzycat.matching import match_container_names
from fuzzycat.status import MatchStatus
from fuzzycat.utils import *
+from fuzzycat.journals import JournalLookup
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
index 28d2812..46786c9 100644
--- a/fuzzycat/issn.py
+++ b/fuzzycat/issn.py
@@ -167,9 +167,11 @@ import shelve
import sys
from typing import Dict, Iterable, List, Union
+from fuzzycat import cleanups
from fuzzycat.utils import (SetEncoder, StringPipeline, normalize_ampersand, normalize_whitespace)
+
def listify(v: Union[str, List[str]]) -> List[str]:
"""
Sensible create a list.
@@ -272,16 +274,7 @@ def de_jsonld(lines: Iterable):
print(json.dumps(doc, cls=SetEncoder))
-# These transformations should not affect the name or a journal.
-cleanup_pipeline = StringPipeline([
- str.lower,
- normalize_whitespace,
- normalize_ampersand,
- lambda v: v.rstrip("."),
-])
-
-
-def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline):
+def generate_name_pairs(lines: Iterable, cleanup_pipeline=None):
"""
Given JSON lines, yield a tuple (issnl, a, b) of test data. Will skip on
errors. Proto unit test data.
@@ -314,29 +307,30 @@ def generate_name_pairs(lines: Iterable, cleanup_pipeline=cleanup_pipeline):
print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
continue
for a, b in itertools.combinations(doc.get("names", []), 2):
- a = cleanup_pipeline.run(a)
- b = cleanup_pipeline.run(b)
+ if cleanup_pipeline:
+ a = cleanup_pipeline(a)
+ b = cleanup_pipeline(b)
yield (doc["issnl"], a, b)
-def generate_name_issn_mapping(lines: Iterable):
+def generate_name_issn_mapping(lines: Iterable, cleanup_pipeline=None):
"""
Given JSON lines, generate a dictionary mapping names sets of ISSN. Names
might be reused.
"""
mapping = collections.defaultdict(set)
- for issnl, a, b in generate_name_pairs(lines):
+ for issnl, a, b in generate_name_pairs(lines, cleanup_pipeline=cleanup_pipeline):
mapping[a].add(issnl)
mapping[b].add(issnl)
return mapping
-def generate_shelve(lines: Iterable, output: str):
+def generate_shelve(lines: Iterable, output: str, cleanup_pipeline=None):
"""
Generate a persistent key value store from name issn mappings.
"""
with shelve.open(output) as db:
- for name, issnls in generate_name_issn_mapping(lines).items():
+ for name, issnls in generate_name_issn_mapping(lines, cleanup_pipeline=cleanup_pipeline).items():
db[name] = issnls
print("wrote {} keys to {}".format(len(db), output), file=sys.stderr)
@@ -361,16 +355,24 @@ def main():
type=str,
default="output.file",
help="write output to file")
+ parser.add_argument("-c",
+ "--cleanup",
+ type=str,
+ default=None,
+ help="cleanup pipeline name")
parser.add_argument("--de-jsonld", action="store_true", help="break up the jsonld")
args = parser.parse_args()
+ # Map more cleanup routines.
+ cleanup = dict(basic=cleanups.basic).get(args.cleanup)
+
if args.make_mapping:
- print(json.dumps(generate_name_issn_mapping(args.file), cls=SetEncoder))
+ print(json.dumps(generate_name_issn_mapping(args.file, cleanup_pipeline=cleanup), cls=SetEncoder))
if args.make_pairs:
- for issn, a, b in generate_name_pairs(args.file):
+ for issn, a, b in generate_name_pairs(args.file, cleanup_pipeline=cleanup):
print("{}\t{}\t{}".format(issn, a, b))
if args.de_jsonld:
de_jsonld(args.file)
if args.make_shelve:
- generate_shelve(args.file, output=args.output)
+ generate_shelve(args.file, output=args.output, cleanup_pipeline=cleanup)
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 97125ce..ab693eb 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -44,6 +44,9 @@ class StringPipeline:
def __init__(self, fs: List[Callable[[str], str]]):
self.fs = fs
+ def __call__(self, s: str) -> str:
+ return self.run(s)
+
def run(self, s: str) -> str:
"""
Apply all function and return result.