about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com>, 2020-11-11 00:29:31 +0100
committer: Martin Czygan <martin.czygan@gmail.com>, 2020-11-11 00:29:31 +0100
commit: fb9f734db3ec0619188488ba1e37acfcc121113e (patch)
tree: 071394cad657767c414df3519fdda369bc6ab2f7
parent: a871e19c2e0aa3c94e338a27f4cc73b76d8ff9c0 (diff)
download: fuzzycat-fb9f734db3ec0619188488ba1e37acfcc121113e.tar.gz
download: fuzzycat-fb9f734db3ec0619188488ba1e37acfcc121113e.zip
verify stub
-rw-r--r-- fuzzycat/main.py 10
-rw-r--r-- fuzzycat/verify.py 19
2 files changed, 8 insertions, 21 deletions
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index d2cdf4d..c7ba23d 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -15,9 +15,9 @@ import cProfile as profile
import io
import logging
import pstats
-# import json
import sys
import tempfile
+import fileinput
import orjson as json
@@ -25,7 +25,6 @@ from fuzzycat.build import NgramLookup, TitleTokenList
from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
release_key_title_nysiis, release_key_title_ngram)
-
def run_cluster(args):
logger = logging.getLogger('main.run_cluster')
types = {
@@ -44,9 +43,11 @@ def run_cluster(args):
def run_verify(args):
"""
- TODO.
+ TODO. Ok, we should not fetch data we have on disk (at the clustering
+ step).
"""
- print('verify')
+ for line in fileinput.input(files=args.files):
+ pass
def run_build(args):
@@ -88,6 +89,7 @@ if __name__ == '__main__':
help='cluster algorithm: title, tnorm, tnysi, tss')
sub_verify = subparsers.add_parser('verify', help='verify groups', parents=[parser])
+ sub_verify.add_argument('-f', '--files', default="-", help='input files')
sub_verify.set_defaults(func=run_verify)
sub_build = subparsers.add_parser('build', help='build auxiliary databases', parents=[parser])
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index a9cc799..841df49 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -21,22 +21,7 @@ Further steps:
* fetch all releases, this might be via API, search index, some local key value
store, or some other cache
* apply various rules, return match status
+* alternatively: have a few more fields in the intermediate representation (to
+keep operation local)
"""
-
-import requests
-
-
-def fetch_release_entity(ident, api="https://api.fatcat.wiki/v0"):
- """
- Fetches a single release entity.
- """
- link = "https://api.fatcat.wiki/v0/release/{}".format(ident)
- return requests.get(link).json()
-
-
-def ident_to_release_entities(ids):
- """
- Turn a list of ids into release entities.
- """
- return [fetch_release_entity(id) for id in ids]