aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/query_fatcat.py
blob: b63d8341216e62c5e72b9bc4019aa28fe9a21f4b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
import json
import os
import sys
from typing import Any, List, Sequence

import elasticsearch
import requests
from elasticsearch_dsl import Q, Search
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error


def requests_retry_session(
    retries: int = 2,
    backoff_factor: int = 3,
    status_forcelist: Sequence[int] = (500, 502, 504),
) -> requests.Session:
    """
    Build a `requests.Session` whose HTTP and HTTPS requests are retried
    according to a urllib3 `Retry` policy.

    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests

    Arguments:

        retries: retry budget; the same value is used for the total, read,
            and connect limits of the `Retry` policy
        backoff_factor: passed through unchanged to `Retry`
        status_forcelist: HTTP status codes passed to `Retry` as the set of
            responses which force a retry. The default is an immutable tuple
            so that a single shared object can never be mutated across calls.

    Returns a new session with the retrying adapter mounted for both the
    "http://" and "https://" prefixes.
    """
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    # a single adapter instance (and thus a single retry policy) is shared by
    # both URL schemes
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for search string passed (and some filters), iterates over the result set
    (using scroll), and fetches full release entity (via api.fatcat.wiki) for
    each.

    Arguments:

        query: query string, run as a "query_string" query against the
            "biblio" field with AND as the default operator
        fulltext_only: if set, restrict hits to releases with `in_ia` true
        json_output: writable text file-like object; one JSON object is
            printed to it per search hit

    The search backend and index name can be overridden with the
    ELASTICSEARCH_FATCAT_BASE and ELASTICSEARCH_FATCAT_RELEASE_INDEX
    environment variables.

    Raises `requests.HTTPError` (via `raise_for_status()`) if an API fetch
    returns an error status, and may raise a requests timeout/connection
    exception if the API does not respond.

    TODO: group by work_id
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_FATCAT_BASE", "https://search.fatcat.wiki"
    )
    es_index = os.environ.get("ELASTICSEARCH_FATCAT_RELEASE_INDEX", "fatcat_release")
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)

    search = search.exclude("terms", release_type=["stub", "component", "abstract"])

    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])

    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])

    if fulltext_only:
        # single scalar value, so this must be a "term" query; "terms"
        # requires a list of values
        search = search.filter("term", in_ia=True)

    search = search.query(
        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
    )

    # progress/diagnostic output goes to stderr so it is never mixed into the
    # JSON written to json_output
    print(f"Expecting {search.count()} search hits", file=sys.stderr)

    search = search.params(clear_scroll=False)
    # only the hit metadata (document id) is needed from the index; the full
    # entity is fetched from the API below
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            f"https://api.fatcat.wiki/v0/release/{release_id}",
            params={
                "expand": "container,files,filesets,webcaptures",
                "hide": "references",
            },
            # requests has no default timeout; without one a stalled
            # connection would block the whole scan indefinitely
            timeout=30.0,
        )
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)


def main() -> None:
    """
    Command-line entry point: parse the arguments and write the query results
    to stdout.

    Invoke as a module:

        python -m fatcat_scholar.query_fatcat
    """

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # positional: the search string itself
    cli.add_argument("query", type=str, help="base query string to use")

    # optional switch, off unless given on the command line
    cli.add_argument(
        "--fulltext-only",
        action="store_true",
        help="flag to filter to only documents with fulltext available",
    )

    options = cli.parse_args()
    run_query_fatcat(options.query, options.fulltext_only, sys.stdout)


# Run the command-line interface only when this file is executed directly
# (eg, via `python -m`), not when it is imported as a module.
if __name__ == "__main__":
    main()