From 6eb88820feb50136571487436b515ed6b748e17d Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 16 Dec 2020 02:21:49 +0100
Subject: docs and release match command

---
 fuzzycat/__main__.py   | 131 ++++++++++++++++++++++++++++++++++++++++++++-----
 fuzzycat/matching.py   |   2 +-
 tests/test_matching.py |  15 ++++--
 3 files changed, 130 insertions(+), 18 deletions(-)

diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index cf29eaa..45c41d3 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -1,15 +1,47 @@
 #!/usr/bin/env python
 """Usage: fuzzycat COMMAND [options]
 
-Commands: cluster, verify, verify-single
+COMMANDS
 
-Run, e.g. fuzzycat cluster --help for more options. Example:
+    cluster
+    verify
+    verify_single
+    release_match
 
-    $ zstdcat -T0 release_export_expanded.json.zst |
-      parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
-      python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+  Run, e.g. fuzzycat cluster --help for more options.
+
+EXAMPLES
+
+  Clustering with GNU parallel.
+
+      $ zstdcat -T0 release_export_expanded.json.zst |
+          parallel --tmpdir /fast/tmp --roundrobin --pipe -j 4 |
+          python -m fuzzycat.main cluster --tmpdir /fast/tmp -t tnorm > clusters.jsonl
+
+  Bulk verification.
+
+      $ zstdcat -T0 cluster_tsandcrawler.json.zst |
+          python -m fuzzycat verify | zstd -c9 > verify.tsv.zst
+
+  Verify a randomly selected pair.
+
+      $ python -m fuzzycat verify-single | jq .
+      {
+        "extra": {
+  	"q": "https://fatcat.wiki/release/search?q=processes"
+        },
+        "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm",
+        "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4",
+        "r": [
+  	"different",
+  	"num_diff"
+        ]
+      }
+
+  Release match (non-bulk).
+
+      $ python -m fuzzycat release_match -q "hello world"
 
-TODO: add docs.
 """
 
 import argparse
@@ -28,14 +60,20 @@ import requests
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_ngram,
                               release_key_title_normalized, release_key_title_nysiis,
                               release_key_title_sandcrawler)
+from fuzzycat.entities import entity_to_dict
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy
 from fuzzycat.utils import random_idents_from_query, random_word
 from fuzzycat.verify import GroupVerifier, verify
+from fatcat_openapi_client import ReleaseEntity
 
 logging.getLogger("requests").setLevel(logging.WARNING)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
 
 
 def run_cluster(args):
+    """
+    Run clustering over release entities from database dump.
+    """
     logger = logging.getLogger('main.run_cluster')
     types = {
         'title': release_key_title,
@@ -59,29 +97,46 @@ def run_cluster(args):
 
 def run_verify(args):
     """
-    TODO. Ok, we should not fetch data we have on disk (at the clustering
-    step).
+    Run match verification over dataset from clustering step.
     """
-    gv = GroupVerifier(iterable=fileinput.input(files=args.files))
-    gv.run()
+    GroupVerifier(iterable=fileinput.input(files=args.files)).run()
 
 
 def run_verify_single(args):
     """
-    Run a single verification on a pair.
+    Run a single verification on a pair (or on a random pair, if none given).
+
+    $ python -m fuzzycat verify-single | jq .
+    {
+      "extra": {
+	"q": "https://fatcat.wiki/release/search?q=processes"
+      },
+      "a": "https://fatcat.wiki/release/r7c33wa4frhx3lgzb3jejd7ijm",
+      "b": "https://fatcat.wiki/release/g6uqzmnt3zgald6blizi6x2wz4",
+      "r": [
+	"different",
+	"num_diff"
+      ]
+    }
+
     """
     result = {}
     if args.a and args.b:
         a, b = args.a, args.b
     elif not args.a and not args.b:
         for _ in range(10):
+            # We try a few times, since not all random words might yield
+            # results.
             word = random_word(wordsfile='/usr/share/dict/words')
             try:
                 idents = random_idents_from_query(query=word, r=2)
+                result.update(
+                    {"extra": {
+                        "q": "https://fatcat.wiki/release/search?q={}".format(word)
+                    }})
+                a, b = idents
             except RuntimeError:
                 continue
-            result.update({"extra": {"q": "https://fatcat.wiki/release/search?q={}".format(word)}})
-            a, b = idents
             break
         else:
             raise RuntimeError('could not fetch random releases')
@@ -99,6 +154,34 @@ def run_verify_single(args):
     print(json.dumps(result))
 
 
+def run_release_match(args):
+    """
+    Given a release, return similar releases.
+    """
+    try:
+        entity = anything_to_entity(args.value, ReleaseEntity)
+        result = match_release_fuzzy(entity, size=args.size, es=args.es_url)
+    except Exception as err:
+        print("fuzzy match failed: {}".format(err), file=sys.stderr)
+    else:
+        if args.output_format == "tsv":
+            for ce in result:
+                vs = [ce.ident, ce.work_id, ce.container_id, ce.title]
+                print("\t".join((str(v) for v in vs)))
+        if args.output_format == "json":
+            matches = []
+            for ce in result:
+                vs = {
+                    "ident": ce.ident,
+                    "work_id": ce.work_id,
+                    "container_id": ce.container_id,
+                    "title": ce.title,
+                }
+                matches.append(vs)
+            vs = {"entity": entity_to_dict(entity), "matches": matches, "match_count": len(matches)}
+            print(json.dumps(vs))
+
+
 if __name__ == '__main__':
     logging.basicConfig(
         level=logging.DEBUG,
@@ -114,6 +197,14 @@ if __name__ == '__main__':
     parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix')
     parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory')
     parser.add_argument('-P', '--profile', action='store_true', help='profile program')
+    parser.add_argument("--es-url",
+                        default="https://search.fatcat.wiki",
+                        help="fatcat elasticsearch")
+    parser.add_argument("-m",
+                        "--output-format",
+                        help="output format, e.g. tsv or json",
+                        default="tsv")
+    parser.add_argument("-s", "--size", help="number of results to return", default=5, type=int)
     subparsers = parser.add_subparsers()
 
     sub_cluster = subparsers.add_parser('cluster', help='group entities', parents=[parser])
@@ -140,6 +231,20 @@ if __name__ == '__main__':
     sub_verify_single.add_argument('-b', help='ident or url to release')
     sub_verify_single.set_defaults(func=run_verify_single)
 
+    sub_release_match = subparsers.add_parser(
+        "release_match",
+        help="find release matches",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[parser])
+
+    sub_release_match.add_argument(
+        "--value",
+        help="release title, issn, QID, filename to entity JSON, or JSON lines",
+        default="hello world",
+        type=str,
+    )
+    sub_release_match.set_defaults(func=run_release_match)
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print(__doc__, file=sys.stderr)
diff --git a/fuzzycat/matching.py b/fuzzycat/matching.py
index b248024..48cc397 100644
--- a/fuzzycat/matching.py
+++ b/fuzzycat/matching.py
@@ -1,6 +1,6 @@
 import os
 import re
-from typing import List, Union, Type
+from typing import List, Type, Union
 
 import elasticsearch
 import elasticsearch_dsl
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 927b383..cded2c8 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -2,18 +2,25 @@ from fuzzycat.matching import anything_to_entity, match_release_fuzzy
 from fatcat_openapi_client import ReleaseEntity
 import pytest
 import elasticsearch
+import logging
+
+logger = logging.getLogger('test_matching')
+logger.setLevel(logging.DEBUG)
+
 
 @pytest.fixture
 def es_client():
-    return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:80"])
+    return elasticsearch.Elasticsearch(["https://search.fatcat.wiki:443"])
+
 
 @pytest.mark.skip
 def test_match_release_fuzzy(es_client):
-    cases = (
-        ("wtv64ahbdzgwnan7rllwr3nurm", 2),
-    )
+    cases = (("wtv64ahbdzgwnan7rllwr3nurm", 2), )
     for case, count in cases:
         entity = anything_to_entity(case, ReleaseEntity)
+        logger.info(entity.title)
 
         result = match_release_fuzzy(entity, es=es_client)
+        logger.info("given: {}".format(entity.title))
+        logger.info("found: {}".format(len(result)))
         assert len(result) == count
-- 
cgit v1.2.3