aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_covid19/query.py
blob: 4397fc53b962b8396e9a4491d13f55d709b5d01a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

import os
import sys
import json
import datetime

import elasticsearch
from elasticsearch_dsl import Search, Q

from fatcat_covid19.common import requests_retry_session


def _build_covid19_search(es_client, es_index):
    """
    Builds the elasticsearch-dsl Search used to find COVID-19 related
    releases: first the exclusion filters, then the keyword/phrase query.

    es_client: elasticsearch client the Search is bound to
    es_index: name of the index to search

    Returns the Search object; nothing is sent to the cluster here.
    """
    search = Search(using=es_client, index=es_index)

    search = search.exclude("terms", release_type=["stub", "component", "abstract"])

    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])

    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])

    # Quoted phrases which, when found in the title, exclude a release. Each
    # is applied as its own exclusion, in this order.
    excluded_title_phrases = (
        # some industrial thing
        '"Report on SARS backfit evaluation"',
        # physics experiment
        '"TOF-SARS"',
        # species not related to SARS
        # something based on excluding "lake" in title might be easier?
        '"G.O. Sars"',
        '"Gomphocythere Sars"',
        '"Australis Sars"',
        '"scutifer Sars"',
        '"lumholtzi Sars"',
    )
    for phrase in excluded_title_phrases:
        search = search.exclude("query_string", query=phrase, fields=["title"])

    # Any single keyword/phrase in the (original) title is enough to match...
    matcher = Q(
        "query_string",
        query='"COVID-19" coronavirus coronaviruses "sars-cov-2" "2019-nCoV" "SARS-CoV" "MERS-CoV" SARS',
        default_operator="OR",
        fields=["title", "original_title"],
    )
    # ...while these term pairs must both be present (AND) in "biblio".
    for term_pair in ('pandemic influenza', 'epidemic influenza', 'pandemic ventilator'):
        matcher = matcher | Q(
            "query_string",
            query=term_pair,
            default_operator="AND",
            fields=["biblio"],
        )

    return search.query(matcher)


def _fetch_release(api_session, release_id):
    """
    Fetches a single release entity from api.fatcat.wiki, with container,
    files, filesets and webcaptures expanded and references hidden.

    api_session: HTTP session object (as returned by requests_retry_session())
    release_id: identifier of the release to fetch

    Returns the decoded JSON body. raise_for_status() is called on the
    response, so an HTTP error status propagates to the caller as an
    exception instead of being written to the output.
    """
    resp = api_session.get(
        'https://api.fatcat.wiki/v0/release/{}'.format(release_id),
        params={
            'expand': 'container,files,filesets,webcaptures',
            'hide': 'references',
        },
    )
    resp.raise_for_status()
    return resp.json()


def query_fatcat(json_output):
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for COVID-19 keywords and phrases, iterates over the result set (using
    scroll), and fetches full release entity (via api.fatcat.wiki) for each.

    json_output: writable text file object. One JSON object per search hit
        is printed to it, one per line, with keys sorted: "fatcat_hit" (the
        hit metadata), "release_id", and "fatcat_release" (the API response).

    The expected number of hits is printed to stderr before iteration starts.
    The search backend URL is read from the ELASTICSEARCH_BACKEND environment
    variable and defaults to https://search.fatcat.wiki.
    """
    api_session = requests_retry_session()

    # NOTE(review): the search backend is overridable here, but the API host
    # used for per-release fetches is fixed -- confirm that is intended.
    es_backend = os.environ.get(
        "ELASTICSEARCH_BACKEND",
        default="https://search.fatcat.wiki",
    )
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = _build_covid19_search(es_client, "fatcat_release")

    print("Expecting {} search hits".format(search.count()), file=sys.stderr)

    # Only hit metadata is used below (the id and the meta mapping), so the
    # document source is not requested; the full entity comes from the API.
    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    for hit in search.scan():
        release_id = hit.meta.id
        release = _fetch_release(api_session, release_id)
        row = dict(
            # `_d_` is a private attribute of the hit metadata object;
            # presumably its underlying plain dict -- TODO confirm
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=release,
        )
        print(json.dumps(row, sort_keys=True), file=json_output)