aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-08-12 14:20:41 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-08-12 14:24:31 +0200
commit: 5a307829670888fedd696e6220c84feed1fe6b64 (patch)
tree: 87e5046442ad95239c1f60982a191ceb1d8b1c9f /fuzzycat
parent: f96c3d0d025ad37836eb908d561b0c607a1f7b5e (diff)
download: fuzzycat-5a307829670888fedd696e6220c84feed1fe6b64.tar.gz
fuzzycat-5a307829670888fedd696e6220c84feed1fe6b64.zip
stub tool: fuzzycat-issn to generate test data
Currently, fuzzycat-issn --make-pairs will generate a TSV with (issn, a, b) examples, e.g. ... 0011-9717 Detskaâ literatura. Детская литература. 0011-9717 Detskaâ literatura. Detskaâ literatura 0011-9717 Детская литература. Detskaâ literatura 0011-6637 Darbininkas. Darbininkas 0012-0820 deutsche Tabakbau deutsche Tabakbau. 0011-5444 Daily Kent stater. Daily Kent stater ... The idea is that these names, by definition, denote the same journal. We might even have a fixed lookup table, since some variants involve multiple scripts (and there are only around 2M names in total). Currently, 1992176 pairs can be generated.
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/issn.py 69
1 files changed, 69 insertions, 0 deletions
diff --git a/fuzzycat/issn.py b/fuzzycat/issn.py
new file mode 100644
index 0000000..8108885
--- /dev/null
+++ b/fuzzycat/issn.py
@@ -0,0 +1,69 @@
+"""
+Munge the ISSN data so we get some container name test data out of it.
+
+ ...
+ "issn": "0000-0019",
+ "mainTitle": "<U+0098>The <U+009C>publishers weekly.",
+ "name": [
+ "<U+0098>The <U+009C>publishers weekly.",
+ "Publishers weekly"
+ ],
+
+Public data from ISSN via:
+https://portal.issn.org/resource/ISSN/0874-2308?format=json, and issnlister
+(https://github.com/miku/issnlister) to aggregate.
+
+The dataset contains naming variants in "name".
+
+Q1: How many of these variants would our matching algorithm detect?
+
+For that, we need a dataset of pairs (a, b) generated from all names (the
+mainTitle is just one of the names).
+
+"""
+
+import argparse
+import sys
+import os
+import json
+import itertools
+
+
+def main():
+    """Print (issn, a, b) name pairs as TSV from ISSN JSON lines; no-op without --make-pairs.
+
+    Reads the positional file, or stdin when it is omitted; lines that fail to parse
+    are reported on stderr and skipped. Raises ValueError if a "name" value is
+    neither a string nor a list.
+    """
+    parser = argparse.ArgumentParser()
+    # nargs="?" makes the positional optional; without it the stdin default is never used.
+    parser.add_argument("file", nargs="?", default=sys.stdin, type=argparse.FileType("r"),
+                        help="public data from issn, one JSON object per line")
+    parser.add_argument("--make-pairs", action="store_true")
+    args = parser.parse_args()
+
+    if not args.make_pairs:
+        return
+    for line in args.file:
+        line = line.strip()
+        try:
+            doc = json.loads(line)
+        except json.decoder.JSONDecodeError as exc:
+            print("failed to parse json: {}, data: {}".format(exc, line), file=sys.stderr)
+            continue
+        for item in doc.get("@graph", []):
+            issn = item.get("issn", "")
+            # Cheap shape check: a non-empty value of nine characters with a dash at index 4.
+            if not issn or len(issn) != 9 or issn[4] != "-":
+                continue
+            names = item.get("name")
+            if not names:
+                continue
+            if isinstance(names, str):
+                names = [names]
+            if not isinstance(names, list):
+                raise ValueError("expected a list: {} {}".format(names, type(names)))
+            # combinations() yields nothing for fewer than two names, so no length check.
+            for a, b in itertools.combinations(names, 2):
+                print("{}\t{}\t{}".format(issn, a, b))