 fuzzycat/__main__.py   | 131
 fuzzycat/matching.py   |   2
 tests/test_matching.py |  15
 3 files changed, 130 insertions(+), 18 deletions(-)
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index cf29eaa..45c41d3 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -1,15 +1,47 @@
#!/usr/bin/env python
"""Usage: fuzzycat COMMAND [options]
-Commands: cluster, verify, verify-single
+COMMANDS
-Run, e.g. fuzzycat cluster --help for more options. Example:
+ cluster
+ verify
+ verify-single
+ release_match
- $ zstdcat -T0 release_export_expanded.json.zst |
- parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
- python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+ Run, e.g. fuzzycat cluster --help for more options.
+
+EXAMPLES
+
+ Clustering with GNU parallel.
+
+ $ zstdcat -T0 release_export_expanded.json.zst |
+ parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
+ python -m fuzzycat cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+
+ Bulk verification.
+
+ $ zstdcat -T0 cluster_tsandcrawler.json.zst |
+ python -m fuzzycat verify | zstd -c9 > verify.tsv.zst
+
+ Verify a randomly selected pair.
+
+ $ python -m fuzzycat verify-single | jq .
+ {
+ "extra": {
+ "q": "https://fatcat.wiki/release/search?q=processes"
+ },
+ "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm",
+ "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4",
+ "r": [
+ "different",
+ "num_diff"
+ ]
+ }
+
+ Release match (non-bulk).
+
+ $ python -m fuzzycat release_match --value "hello world"
-TODO: add docs.
"""
import argparse
@@ -28,14 +60,20 @@ import requests
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
release_key_title_normalized, release_key_title_nysiis,
release_key_title_sandcrawler)
+from fuzzycat.entities import entity_to_dict
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy
from fuzzycat.utils import random_idents_from_query, random_word
from fuzzycat.verify import GroupVerifier, verify
+from fatcat_openapi_client import ReleaseEntity
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
def run_cluster(args):
+ """
+ Run clustering over release entities from a database dump.
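+
+ Expects newline-delimited release entities on stdin and writes clusters as
+ JSON lines to stdout (see EXAMPLES in the module docstring).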
+ """
logger = logging.getLogger('main.run_cluster')
types = {
'title': release_key_title,
@@ -59,29 +97,46 @@ def run_cluster(args):
def run_verify(args):
"""
- TODO. Ok, we should not fetch data we have on disk (at the clustering
- step).
+ Run match verification over a dataset from the clustering step.
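+
+ Reads clusters as JSON lines and writes verification results to stdout
+ (see the bulk verification example in the module docstring).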
"""
- gv = GroupVerifier(iterable=fileinput.input(files=args.files))
- gv.run()
+ GroupVerifier(iterable=fileinput.input(files=args.files)).run()
def run_verify_single(args):
"""
- Run a single verification on a pair.
+ Run a single verification on a pair (or on a random pair, if none given).
+
+ $ python -m fuzzycat verify-single | jq .
+ {
+ "extra": {
+ "q": "https://fatcat.wiki/release/search?q=processes"
+ },
+ "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm",
+ "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4",
+ "r": [
+ "different",
+ "num_diff"
+ ]
+ }
+
"""
result = {}
if args.a and args.b:
a, b = args.a, args.b
elif not args.a and not args.b:
for _ in range(10):
+ # Try a few times, since not every random word yields enough
+ # search results.
word = random_word(wordsfile='/usr/share/dict/words')
try:
idents = random_idents_from_query(query=word, r=2)
+ result.update(
+ {"extra": {
+ "q": "https://fatcat.wiki/release/search?q={}".format(word)
+ }})
+ a, b = idents
except RuntimeError:
continue
- result.update({"extra": {"q": "https://fatcat.wiki/release/search?q={}".format(word)}})
- a, b = idents
break
else:
raise RuntimeError('could not fetch random releases')
@@ -99,6 +154,34 @@ def run_verify_single(args):
print(json.dumps(result))
+def run_release_match(args):
+ """
+ Given a release, return similar releases.
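+
+ Output format is selected with -m/--output-format (tsv or json), the number
+ of results with -s/--size, and the search backend with --es-url.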
+ """
+ try:
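+ # Convert the raw value (title, ISSN, QID, JSON, ...) into a ReleaseEntity.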
+ entity = anything_to_entity(args.value, ReleaseEntity)
+ result = match_release_fuzzy(entity, size=args.size, es=args.es_url)
+ except Exception as err:
+ print("fuzzy match failed: {}".format(err), file=sys.stderr)
+ else:
+ if args.output_format == "tsv":
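+ # One tab-separated line per match: ident, work id, container id, title.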
+ for ce in result:
+ vs = [ce.ident, ce.work_id, ce.container_id, ce.title]
+ print("\t".join((str(v) for v in vs)))
+ if args.output_format == "json":
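+ # Single JSON document with the query entity, its matches, and the match count.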
+ matches = []
+ for ce in result:
+ vs = {
+ "ident": ce.ident,
+ "work_id": ce.work_id,
+ "container_id": ce.container_id,
+ "title": ce.title,
+ }
+ matches.append(vs)
+ vs = {"entity": entity_to_dict(entity), "matches": matches, "match_count": len(matches)}
+ print(json.dumps(vs))
+
+
if __name__ == '__main__':
logging.basicConfig(
level=logging.DEBUG,
@@ -114,6 +197,14 @@ if __name__ == '__main__':
parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
parser.add_argument('-P', '--profile', action='store_true', help='profile program')
+ parser.add_argument("--es-url",
+ default="https://search.fatcat.wiki",
+ help="fatcat elasticsearch")
+ parser.add_argument("-m",
+ "--output-format",
+ help="output format, e.g. tsv or json",
+ default="tsv")
+ parser.add_argument("-s", "--size", help="number of results to return", default=5, type=int)
subparsers = parser.add_subparsers()
sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
@@ -140,6 +231,20 @@ if __name__ == '__main__':
sub_verify_single.add_argument('-b', help='ident or url to release')
sub_verify_single.set_defaults(func=run_verify_single)
+ sub_release_match = subparsers.add_parser(
+ "release_match",
+ help="find release matches",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ parents=[parser])
+
+ sub_release_match.add_argument(
+ "--value",
+ help="release title, issn, QID, filename to entity JSON, or JSON lines",
+ default="hello world",
+ type=str,
+ )
+ sub_release_match.set_defaults(func=run_release_match)
+
args = parser.parse_args()
if not args.__dict__.get("func"):
print(__doc__, file=sys.stderr)
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b248024..48cc397 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,6 +1,6 @@
import os
import re
-from typing import List, Union, Type
+from typing import List, Type, Union
import elasticsearch
import elasticsearch_dsl
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 927b383..cded2c8 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -2,18 +2,25 @@ from fuzzycat.matching import anything_to_entity, match_release_fuzzy
from fatcat_openapi_client import ReleaseEntity
import pytest
import elasticsearch
+import logging
+
+logger = logging.getLogger('test_matching')
+logger.setLevel(logging.DEBUG)
+
@pytest.fixture
def es_client():
- return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:80"])
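+ # Talks to the live fatcat search index; port 443 matches the https scheme.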
+ return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:443"])
+
@pytest.mark.skip
def test_match_release_fuzzy(es_client):
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 2),
- )
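+ # Each case pairs a release ident with the expected number of fuzzy matches.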
+ cases = (("wtv64ahbdzgwnan7rllwr3nurm", 2), )
for case, count in cases:
entity = anything_to_entity(case, ReleaseEntity)
+ logger.info(entity.title)
result = match_release_fuzzy(entity, es=es_client)
+ logger.info("given: {}".format(entity.title))
+ logger.info("found: {}".format(len(result)))
assert len(result) == count