summary refs log tree commit diff stats
path: root/fatcat_scholar/query_fatcat.py
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar/query_fatcat.py')
-rw-r--r-- fatcat_scholar/query_fatcat.py 117
1 files changed, 117 insertions, 0 deletions
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
new file mode 100644
index 0000000..c2b28a0
--- /dev/null
+++ b/fatcat_scholar/query_fatcat.py
@@ -0,0 +1,117 @@
+import os
+import sys
+import json
+import argparse
+from typing import Any, List, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
+import elasticsearch
+from elasticsearch_dsl import Search, Q
+
+
+def requests_retry_session(
+    retries: int = 2,
+    backoff_factor: int = 3,
+    status_forcelist: Optional[List[int]] = None,
+) -> requests.Session:
+    """
+    Builds a requests.Session whose "http://" and "https://" adapters retry
+    failed requests.
+
+    From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+
+    retries: used as the total, read, and connect retry limit
+    backoff_factor: passed through unchanged to urllib3's Retry
+    status_forcelist: HTTP status codes that should trigger a retry; when
+        not supplied, 500, 502, and 504 are used
+
+    Returns the configured session.
+    """
+    # Build the default per call: a list literal in the signature would be a
+    # single object shared by every call and by every Retry created from it.
+    if status_forcelist is None:
+        status_forcelist = [500, 502, 504]
+
+    session = requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    # The same adapter (and so the same retry policy) serves both schemes.
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+
+
+def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
+    """
+    Queries fatcat search index (the full regular fatcat.wiki release index)
+    for search string passed (and some filters), iterates over the result set
+    (using scroll), and fetches full release entity (via api.fatcat.wiki) for
+    each.
+
+    query: query string, matched against the "biblio" field with AND as the
+        default operator
+    fulltext_only: when true, only hits whose "in_ia" field is true are kept
+    json_output: writable file-like object; receives one JSON object per
+        line, with keys "fatcat_hit", "release_id", and "fatcat_release"
+
+    The search backend URL is read from the ELASTICSEARCH_BACKEND environment
+    variable (default: https://search.fatcat.wiki). The expected hit count is
+    printed to stderr before iteration starts.
+
+    Raises requests.HTTPError (via raise_for_status) if any release fetch
+    returns an error status; iteration stops at that point.
+
+    TODO: group by work_id
+    """
+    api_session = requests_retry_session()
+
+    es_backend = os.environ.get(
+        "ELASTICSEARCH_BACKEND", default="https://search.fatcat.wiki",
+    )
+    es_index = "fatcat_release"
+    es_client = elasticsearch.Elasticsearch(es_backend)
+
+    search = Search(using=es_client, index=es_index)
+
+    search = search.exclude("terms", release_type=["stub", "component", "abstract"])
+
+    # "Emerald Expert Briefings"
+    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])
+
+    # ResearchGate
+    search = search.exclude("terms", doi_prefix=["10.13140"])
+
+    if fulltext_only:
+        # "term" (single value), not "terms": a terms query requires an array
+        # of values and is rejected when handed a bare boolean.
+        search = search.filter("term", in_ia=True)
+
+    search = search.query(
+        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
+    )
+
+    print("Expecting {} search hits".format(search.count()), file=sys.stderr)
+
+    search = search.params(clear_scroll=False)
+    # Only the hit metadata (id etc) is used below, so skip document bodies.
+    search = search.params(_source=False)
+
+    results = search.scan()
+    for hit in results:
+        release_id = hit.meta.id
+        resp = api_session.get(
+            "https://api.fatcat.wiki/v0/release/{}".format(release_id),
+            params={
+                "expand": "container,files,filesets,webcaptures",
+                "hide": "references",
+            },
+        )
+        resp.raise_for_status()
+        row = dict(
+            fatcat_hit=hit.meta._d_, release_id=release_id, fatcat_release=resp.json(),
+        )
+        print(json.dumps(row, sort_keys=True), file=json_output)
+
+
+def main() -> None:
+    """
+    Command-line entry point. Invoke as:
+
+        python -m fatcat_scholar.query_fatcat
+
+    Parses the positional query string and the optional --fulltext-only
+    switch, then writes results to stdout via run_query_fatcat().
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    # positional: the search string itself
+    cli.add_argument("query", type=str, help="base query string to use")
+    # optional switch; false unless given on the command line
+    cli.add_argument(
+        "--fulltext-only",
+        action="store_true",
+        help="flag to filter to only documents with fulltext available",
+    )
+    opts = cli.parse_args()
+
+    run_query_fatcat(
+        query=opts.query,
+        fulltext_only=opts.fulltext_only,
+        json_output=sys.stdout,
+    )
+
+
+# Run the CLI only when this module is executed directly (for example with
+# `python -m`), not when it is imported.
+if __name__ == "__main__":
+ main()