refactor python modules

author: Bryan Newbold <bnewbold@robocracy.org> 2018-11-12 23:18:56 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-11-12 23:18:56 -0800
commit: b03bfc8f3fd84141738f775b273a99850d78e1ff (patch)
tree: 64858e474fa38aa015f06f5e15b851dcc85da421 /python/fatcat_web/search.py
parent: 055c464deea8cdaccf3ed384995d4409b0f51409 (diff)
download: fatcat-b03bfc8f3fd84141738f775b273a99850d78e1ff.tar.gz
fatcat-b03bfc8f3fd84141738f775b273a99850d78e1ff.zip
1 files changed, 60 insertions, 0 deletions
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
new file mode 100644
index 00000000..b6826110
--- /dev/null
+++ b/python/fatcat_web/search.py
@@ -0,0 +1,60 @@
+
+import requests
+from flask import abort
+from fatcat import app
+
+
+def do_search(q, limit=50, fulltext_only=True):
+
+    #print("Search hit: " + q)
+    if limit > 100:
+        # Sanity check
+        limit = 100
+
+    if fulltext_only:
+        q += " file_in_ia:true"
+
+    search_request = {
+        "query": {
+            "query_string": {
+            "query": q,
+            "analyzer": "textIcuSearch",
+            "default_operator": "AND",
+            "analyze_wildcard": True,
+            "lenient": True,
+            "fields": ["title^5", "contrib_names^2", "container_title"]
+            },
+        },
+        "size": int(limit),
+    }
+
+    #print(search_request)
+    resp = requests.get("%s/%s/_search" %
+            (app.config['ELASTIC_BACKEND'], app.config['ELASTIC_INDEX']),
+        json=search_request)
+
+    if resp.status_code != 200:
+        print("elasticsearch non-200 status code: " + str(resp.status_code))
+        print(resp.content)
+        abort(resp.status_code)
+
+    content = resp.json()
+    #print(content)
+    results = [h['_source'] for h in content['hits']['hits']]
+    for h in results:
+        # Ensure 'contrib_names' is a list, not a single string
+        if type(h['contrib_names']) is not list:
+            h['contrib_names'] = [h['contrib_names'], ]
+        # Handle surrogate strings that elasticsearch returns sometimes,
+        # probably due to mangled data processing in some pipeline.
+        # "Crimes against Unicode"; production workaround
+        for key in h:
+            if type(h[key]) is str:
+                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
+
+    found = content['hits']['total']
+    return {"query": { "q": q },
+            "count_returned": len(results),
+            "count_found": found,
+            "results": results }
author	Bryan Newbold <bnewbold@robocracy.org>	2018-11-12 23:18:56 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2018-11-12 23:18:56 -0800
commit	b03bfc8f3fd84141738f775b273a99850d78e1ff (patch)
tree	64858e474fa38aa015f06f5e15b851dcc85da421 /python/fatcat_web/search.py
parent	055c464deea8cdaccf3ed384995d4409b0f51409 (diff)
download	fatcat-b03bfc8f3fd84141738f775b273a99850d78e1ff.tar.gz fatcat-b03bfc8f3fd84141738f775b273a99850d78e1ff.zip