author     Bryan Newbold <bnewbold@robocracy.org>    2019-02-20 19:58:29 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>    2019-02-20 19:58:29 -0800
commit     46e67e77cf7709dae7375a22a6fed1ac730355bf (patch)
tree       ea8ae53d57d3dc2e9cb2c6c0efa5359ef9f8d773 /python/fatcat_web/search.py
parent     7c04b83a6612b7a0c87afe4a1ed4bbb65568fea1 (diff)
download   fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.tar.gz
           fatcat-46e67e77cf7709dae7375a22a6fed1ac730355bf.zip
refactor entity searches
Diffstat (limited to 'python/fatcat_web/search.py')
-rw-r--r--  python/fatcat_web/search.py  116
1 file changed, 52 insertions, 64 deletions
diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py
index a301fcb5..d18416d6 100644
--- a/python/fatcat_web/search.py
+++ b/python/fatcat_web/search.py
@@ -10,33 +10,18 @@ the formal API)
 
 TODO: ELASTICSEARCH_*_INDEX should probably be factored out and just hard-coded
 """
 
-def do_release_search(q, limit=50, fulltext_only=True):
-    #print("Search hit: " + q)
+def do_search(index, request, limit=30):
+
     if limit > 100:
         # Sanity check
         limit = 100
 
-    if fulltext_only:
-        q += " in_web:true"
-
-    search_request = {
-        "query": {
-            "query_string": {
-                "query": q,
-                "default_operator": "AND",
-                "analyze_wildcard": True,
-                "lenient": True,
-                "fields": ["title^5", "contrib_names^2", "container_title"]
-            },
-        },
-        "size": int(limit),
-    }
-
-    #print(search_request)
+    request["size"] = int(limit)
+    #print(request)
 
     resp = requests.get("%s/%s/_search" %
-        (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_RELEASE_INDEX']),
-        json=search_request)
+        (app.config['ELASTICSEARCH_BACKEND'], index),
+        json=request)
 
     if resp.status_code == 400:
         print("elasticsearch 400: " + str(resp.content))
@@ -48,73 +33,76 @@ def do_release_search(q, limit=50, fulltext_only=True):
         abort(resp.status_code)
 
     content = resp.json()
-    #print(content)
     results = [h['_source'] for h in content['hits']['hits']]
     for h in results:
-        # Ensure 'contrib_names' is a list, not a single string
-        if type(h['contrib_names']) is not list:
-            h['contrib_names'] = [h['contrib_names'], ]
         # Handle surrogate strings that elasticsearch returns sometimes,
         # probably due to mangled data processing in some pipeline.
         # "Crimes against Unicode"; production workaround
         for key in h:
            if type(h[key]) is str:
                 h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
-        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
 
-    found = content['hits']['total']
-    return {"query": { "q": q },
-            "count_returned": len(results),
-            "count_found": found,
+    return {"count_returned": len(results),
+            "count_found": content['hits']['total'],
             "results": results }
 
-def do_container_search(q, limit=50):
+
+def do_release_search(q, limit=30, fulltext_only=True):
 
     #print("Search hit: " + q)
     if limit > 100:
         # Sanity check
         limit = 100
 
+    # Convert raw DOIs to DOI queries
+    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
+        q = 'doi:"{}"'.format(q)
+
+
+    if fulltext_only:
+        q += " in_web:true"
+
     search_request = {
         "query": {
             "query_string": {
-                "query": q,
-                "default_operator": "AND",
-                "analyze_wildcard": True,
-                "lenient": True,
-                "fields": ["name^5", "publisher"]
+                "query": q,
+                "default_operator": "AND",
+                "analyze_wildcard": True,
+                "lenient": True,
+                "fields": ["title^5", "contrib_names^2", "container_title"],
             },
         },
-        "size": int(limit),
     }
 
-    #print(search_request)
-    resp = requests.get("%s/%s/_search" %
-        (app.config['ELASTICSEARCH_BACKEND'], app.config['ELASTICSEARCH_CONTAINER_INDEX']),
-        json=search_request)
+    resp = do_search(app.config['ELASTICSEARCH_RELEASE_INDEX'], search_request)
+    for h in resp['results']:
+        # Ensure 'contrib_names' is a list, not a single string
+        if type(h['contrib_names']) is not list:
+            h['contrib_names'] = [h['contrib_names'], ]
+        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
+    resp["query"] = { "q": q }
+    return resp
 
-    if resp.status_code == 400:
-        print("elasticsearch 400: " + str(resp.content))
-        flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
-        abort(resp.status_code)
-    elif resp.status_code != 200:
-        print("elasticsearch non-200 status code: " + str(resp.status_code))
-        print(resp.content)
-        abort(resp.status_code)
 
-    content = resp.json()
-    #print(content)
-    results = [h['_source'] for h in content['hits']['hits']]
-    for h in results:
-        # Handle surrogate strings that elasticsearch returns sometimes,
-        # probably due to mangled data processing in some pipeline.
-        # "Crimes against Unicode"; production workaround
-        for key in h:
-            if type(h[key]) is str:
-                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+def do_container_search(q, limit=30):
+
+    # Convert raw ISSN-L to ISSN-L query
+    if len(q.split()) == 1 and len(q) == 9 and isdigit(q[0:4]) and q[4] == '-':
+        q = 'issnl:"{}"'.format(q)
+
+    search_request = {
+        "query": {
+            "query_string": {
+                "query": q,
+                "default_operator": "AND",
+                "analyze_wildcard": True,
+                "lenient": True,
+                "fields": ["name^5", "publisher"],
+            },
+        },
+    }
+
+    resp = do_search(app.config['ELASTICSEARCH_CONTAINER_INDEX'], search_request, limit=limit)
+    resp["query"] = { "q": q }
+    return resp
 
-    found = content['hits']['total']
-    return {"query": { "q": q },
-            "count_returned": len(results),
-            "count_found": found,
-            "results": results }
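
The refactor moves all of the Elasticsearch request/response handling into the new do_search() helper, leaving do_release_search() and do_container_search() responsible only for building a query_string body and post-processing hits, plus the new bare-identifier detection for raw DOIs and ISSN-Ls. The sketch below illustrates that detection in isolation; it is not code from the commit, the function names and sample identifiers are invented for the example, and where the committed diff calls isdigit(q[0:4]) the sketch uses the plain-Python equivalent q[0:4].isdigit().

# Hypothetical stand-alone sketch of the query normalization added in this
# commit (not part of fatcat_web/search.py); function names are invented.

def normalize_release_query(q):
    # A single token that looks like a raw DOI becomes a doi: field query,
    # mirroring the new check in do_release_search().
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        return 'doi:"{}"'.format(q)
    return q

def normalize_container_query(q):
    # A single 9-character "1234-5678" style token becomes an issnl: field
    # query, mirroring do_container_search() (isdigit spelled as a str method here).
    if len(q.split()) == 1 and len(q) == 9 and q[0:4].isdigit() and q[4] == '-':
        return 'issnl:"{}"'.format(q)
    return q

assert normalize_release_query("10.1371/journal.pone.0020124") == 'doi:"10.1371/journal.pone.0020124"'
assert normalize_container_query("2167-8359") == 'issnl:"2167-8359"'
assert normalize_container_query("open access publishing") == "open access publishing"

Either way the resulting query string is passed to Elasticsearch unchanged, so multi-word searches keep the existing query_string behavior while pasted identifiers hit the exact-match fields.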