fatcat_covid19/search.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139


"""
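Helpers to make elasticsearch queries.

Queries are built and executed with the elasticsearch-dsl library (see the
`Search` import below).

NOTE: an earlier TODO here proposed switching from requests+json to
elasticsearch-dsl (with a WIP branch in the fatcat repo); this module already
uses elasticsearch-dsl, so that TODO appears to be done.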
"""

import json
import datetime
from flask import abort
from fatcat_covid19.webface import app

import elasticsearch
from elasticsearch_dsl import Search, Q


def generic_search_execute(search, limit=25, offset=0, deep_page_limit=2000):
    """
    Execute a search with bounded pagination and return hits as plain dicts.

    Args:
        search: query object supporting slicing and `.execute()` (in this
            module, an elasticsearch-dsl `Search`).
        limit: maximum number of hits to return; coerced to int and clamped
            to the range 0..100.
        offset: index of the first hit to return; coerced to int and clamped
            to the range 0..deep_page_limit.
        deep_page_limit: largest offset that will be requested.

    Returns:
        dict with keys `count_returned`, `count_found`, `results` (list of
        hit dicts, each with an added `_highlights` list), `offset`, `limit`
        (both as actually applied), `deep_page_limit`, and `query_time_ms`.

    Raises:
        The HTTP exception raised by flask `abort()` when elasticsearch
        rejects the request or the request otherwise fails.
    """

    # Sanity checks. Coerce to int *before* clamping so that numeric strings
    # are clamped instead of raising TypeError on comparison, and so that a
    # negative limit can not produce a negative slice.
    limit = max(0, min(int(limit), 100))
    # Clamp the offset to avoid the deep paging problem.
    offset = max(0, min(int(offset), deep_page_limit))

    search = search[offset:offset + limit]

    try:
        resp = search.execute()
    except elasticsearch.exceptions.RequestError as e:
        # this is a "user" error (eg, query string failed to parse)
        print("elasticsearch 400: " + str(e.info))
        abort(e.status_code)
    except elasticsearch.exceptions.TransportError as e:
        # all other errors
        print("elasticsearch non-200 status code: {}".format(e.info))
        # No flash() here: it is not imported in this module, and the sibling
        # handler above does not flash either.
        status_code = e.status_code
        if not isinstance(status_code, int):
            # Connection-level failures carry a non-numeric status (eg,
            # 'N/A'), which abort() can not map to an HTTP error.
            status_code = 503
        abort(status_code)

    # convert from objects to python dicts
    results = []
    for hit in resp:
        doc = hit._d_
        doc['_highlights'] = []
        highlight = getattr(hit.meta, 'highlight', None)
        if highlight is not None:
            # flatten the per-field fragment lists into a single list
            for fragments in highlight._d_.values():
                doc['_highlights'] += fragments

        # Handle surrogate strings that elasticsearch returns sometimes,
        # probably due to mangled data processing in some pipeline.
        # "Crimes against Unicode"; production workaround
        for key, value in doc.items():
            if isinstance(value, str):
                doc[key] = value.encode('utf8', 'ignore').decode('utf8')
        results.append(doc)

    # hits.total is either a plain integer or an object carrying the count
    # in `.value`; accept both forms.
    total = resp.hits.total
    count_found = int(getattr(total, 'value', total))

    return {
        "count_returned": len(results),
        "count_found": count_found,
        "results": results,
        "offset": offset,
        "limit": limit,
        "deep_page_limit": deep_page_limit,
        "query_time_ms": int(resp.took),
    }

def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None):
    """
    Run a fulltext query against the configured fulltext index.

    Args:
        q: query string, passed to an elasticsearch `query_string` query. A
            single token that looks like a raw DOI is rewritten to a quoted
            `doi:` query.
        limit: maximum number of results (passed on to
            `generic_search_execute()`).
        offset: index of the first result (passed on to
            `generic_search_execute()`).
        filter_time: one of "past_week", "this_year", "since_2000",
            "before_1925", "all_time", or None (no time filter).
        filter_type: one of "papers", "reports", "datasets", "everything",
            or None (no type filter).

    Returns:
        The dict returned by `generic_search_execute()`, with `contrib_names`
        normalized to a list on every result, plus a `query` key holding the
        query string as actually sent.

    Raises:
        HTTP 400 (via flask `abort()`) for an unrecognized `filter_time` or
        `filter_type`.
    """

    # Convert raw DOIs to DOI queries
    if len(q.split()) == 1 and q.startswith("10.") and q.count("/") >= 1:
        q = 'doi:"{}"'.format(q)

    search = Search(using=app.es_client, index=app.config['ELASTICSEARCH_FULLTEXT_INDEX'])

    # type filters
    if filter_type == "papers":
        search = search.filter("terms", release_type=["article-journal", "paper-conference", "chapter"])
    elif filter_type == "reports":
        search = search.filter("terms", release_type=["report", "standard"])
    elif filter_type == "datasets":
        search = search.filter("terms", release_type=["dataset", "software"])
    elif filter_type == "everything" or filter_type is None:
        pass
    else:
        abort(400)

    # time filters
    if filter_time == "past_week":
        week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))
        search = search.filter("range", release_date=dict(gte=week_ago_date))
    elif filter_time == "this_year":
        search = search.filter("term", release_year=datetime.date.today().year)
    elif filter_time == "since_2000":
        search = search.filter("range", release_year=dict(gte=2000))
    elif filter_time == "before_1925":
        search = search.filter("range", release_year=dict(lte=1924))
    elif filter_time == "all_time" or filter_time is None:
        pass
    else:
        abort(400)

    search = search.query(
        'query_string',
        query=q,
        default_operator="AND",
        analyze_wildcard=True,
        lenient=True,
        fields=[
            "everything",
            "abstract",
            "fulltext.body",
            "fulltext.annex",
        ],
    )
    search = search.highlight(
        "abstract",
        "fulltext.body",
        "fulltext.annex",
        number_of_fragments=3,
        fragment_size=150,
    )

    # Pass `limit` through; it was previously accepted but silently ignored.
    resp = generic_search_execute(search, limit=limit, offset=offset)

    for h in resp['results']:
        # Ensure 'contrib_names' is a list, not a single string; a missing or
        # null value becomes an empty list instead of raising.
        names = h.get('contrib_names')
        if names is None:
            names = []
        elif type(names) is not list:
            names = [names]
        # Drop surrogate code points that can not be encoded as UTF-8
        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in names]

    resp["query"] = {"q": q}
    return resp