fatcat_covid19/search.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117


"""
Helpers to make elasticsearch queries.

TODO: switch to using elasticsearch-dsl library instead of requests+json.
already have a WIP branch for this in fatcat repo.
"""

import json
import datetime
import requests
from flask import abort, flash
from fatcat_covid19.webface import app

def do_search(index, request, limit=30, offset=0, deep_page_limit=2000):

    # Sanity checks
    if limit > 100:
        limit = 100
    if offset < 0:
        offset = 0
    if offset > deep_page_limit:
        # Avoid deep paging problem.
        offset = deep_page_limit

    request["size"] = int(limit)
    request["from"] = int(offset)
    # print(request)
    resp = requests.get("%s/%s/_search" %
            (app.config['ELASTICSEARCH_BACKEND'], index),
        json=request)

    if resp.status_code == 400:
        print("elasticsearch 400: " + str(resp.content))
        #flash("Search query failed to parse; you might need to use quotes.<p><code>{}</code>".format(resp.content))
        abort(resp.status_code)
    elif resp.status_code != 200:
        print("elasticsearch non-200 status code: " + str(resp.status_code))
        print(resp.content)
        abort(resp.status_code)

    content = resp.json()
    #print(json.dumps(content, indent=2))
    results = []
    for h in content['hits']['hits']:
        r = h['_source']
        r['_highlights'] = []
        highlights = h.get('highlight', {})
        for k in highlights:
            r['_highlights'] += highlights[k]
        results.append(r)
    #print(json.dumps(results, indent=2))
    for h in results:
        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        for key in h:
            if type(h[key]) is str:
                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')

    return {"count_returned": len(results),
            "count_found": content['hits']['total'],
            "results": results,
            "offset": offset,
            "deep_page_limit": deep_page_limit}

def do_fulltext_search(q, limit=30, offset=0):

    #print("Search hit: " + q)
    if limit > 100:
        # Sanity check
        limit = 100

    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)


    search_request = {
        "query": {
            "query_string": {
                "query": q,
                "default_operator": "AND",
                "analyze_wildcard": True,
                "lenient": True,
                "fields": [
                    "everything",
                    "abstract",
                    "fulltext.body",
                    "fulltext.annex",
                ],
            },
        },
        "highlight" : {
            "number_of_fragments" : 3,
            "fragment_size" : 150,
            "fields" : {
                "abstract": { },
                "fulltext.body": { },
                "fulltext.annex": { },
                #"everything": { "number_of_fragments" : 3 },
                #"fulltext.abstract": { "number_of_fragments" : 3 },
                #"fulltext.body":     { "number_of_fragments" : 3 },
                #"fulltext.annex":    { "number_of_fragments" : 3 },
            },
        },
    }

    resp = do_search(app.config['ELASTICSEARCH_FULLTEXT_INDEX'], search_request, offset=offset)
    for h in resp['results']:
        # Ensure 'contrib_names' is a list, not a single string
        if type(h['contrib_names']) is not list:
            h['contrib_names'] = [h['contrib_names'], ]
        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
    resp["query"] = { "q": q }
    resp["limit"] = limit
    return resp