author     Bryan Newbold <bnewbold@archive.org>   2020-04-09 17:49:08 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2020-04-09 17:49:08 -0700
commit     8b3b3e5892a10bf6748c7824549641d20e2447d7 (patch)
tree       493938a53995cf29f5e2f435271c309bd4ce4aa6 /fatcat_covid19
parent     042bd36c25206ff45e305d094028b6482a4c4074 (diff)
add dedupe and query-fatcat commands
Diffstat (limited to 'fatcat_covid19')
-rw-r--r--  fatcat_covid19/dedupe.py   27
-rw-r--r--  fatcat_covid19/query.py    80
2 files changed, 107 insertions, 0 deletions
diff --git a/fatcat_covid19/dedupe.py b/fatcat_covid19/dedupe.py
new file mode 100644
index 0000000..86cf8fa
--- /dev/null
+++ b/fatcat_covid19/dedupe.py
@@ -0,0 +1,27 @@
+
+import sys
+import json
+import datetime
+
+
+def dedupe_file(json_input, json_output):
+ """
+    Takes a JSON-lines file of "fatcat enriched" content and de-dupes rows
+    based on the fatcat release identifier ('release_id').
+ """
+ rows = dict()
+ for l in json_input:
+ l = json.loads(l)
+ key = l.get('release_id')
+ if not key:
+ continue
+        if key not in rows:
+ rows[key] = l
+ continue
+ for other_info in ['cord19_paper', 'fatcat_hit',]:
+ if other_info in l:
+ rows[key][other_info] = l[other_info]
+
+ for k in rows:
+ print(json.dumps(rows[k], sort_keys=True), file=json_output)
+
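A minimal usage sketch for the new dedupe_file() helper, reading a JSON-lines file and writing the de-duplicated rows back out; the file names and surrounding wiring are illustrative assumptions, not part of this diff:

    # Hypothetical driver for dedupe_file(); file names are placeholders.
    from fatcat_covid19.dedupe import dedupe_file

    with open("enriched.json") as json_input, open("deduped.json", "w") as json_output:
        # Keeps the first row per 'release_id' and merges any 'cord19_paper'
        # or 'fatcat_hit' fields found on later duplicates.
        dedupe_file(json_input, json_output)
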
diff --git a/fatcat_covid19/query.py b/fatcat_covid19/query.py
new file mode 100644
index 0000000..4397fc5
--- /dev/null
+++ b/fatcat_covid19/query.py
@@ -0,0 +1,80 @@
+
+import os
+import sys
+import json
+import datetime
+
+import elasticsearch
+from elasticsearch_dsl import Search, Q
+
+from fatcat_covid19.common import requests_retry_session
+
+
+def query_fatcat(json_output):
+ """
+ Queries fatcat search index (the full regular fatcat.wiki release index)
+ for COVID-19 keywords and phrases, iterates over the result set (using
+    scroll), and fetches the full release entity (via api.fatcat.wiki) for each.
+ """
+ api_session = requests_retry_session()
+
+ es_backend = os.environ.get(
+ "ELASTICSEARCH_BACKEND",
+ default="https://search.fatcat.wiki",
+ )
+ es_index = "fatcat_release"
+ es_client = elasticsearch.Elasticsearch(es_backend)
+
+ search = Search(using=es_client, index=es_index)
+
+ search = search.exclude("terms", release_type=["stub", "component", "abstract"])
+
+ # "Emerald Expert Briefings"
+ search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])
+
+ # ResearchGate
+ search = search.exclude("terms", doi_prefix=["10.13140"])
+
+ # some industrial thing
+ search = search.exclude("query_string", query='"Report on SARS backfit evaluation"', fields=["title"])
+
+    # physics experiment
+ search = search.exclude("query_string", query='"TOF-SARS"', fields=["title"])
+
+    # species names citing the zoologist G.O. Sars, not related to the SARS virus
+ # something based on excluding "lake" in title might be easier?
+ search = search.exclude("query_string", query='"G.O. Sars"', fields=["title"])
+ search = search.exclude("query_string", query='"Gomphocythere Sars"', fields=["title"])
+ search = search.exclude("query_string", query='"Australis Sars"', fields=["title"])
+ search = search.exclude("query_string", query='"scutifer Sars"', fields=["title"])
+ search = search.exclude("query_string", query='"lumholtzi Sars"', fields=["title"])
+
+ search = search.query(
+ Q("query_string", query='"COVID-19" coronavirus coronaviruses "sars-cov-2" "2019-nCoV" "SARS-CoV" "MERS-CoV" SARS', default_operator="OR", fields=["title", "original_title"]) |
+ Q("query_string", query='pandemic influenza', default_operator="AND", fields=["biblio"]) |
+ Q("query_string", query='epidemic influenza', default_operator="AND", fields=["biblio"]) |
+ Q("query_string", query='pandemic ventilator', default_operator="AND", fields=["biblio"])
+ )
+
+ print("Expecting {} search hits".format(search.count()), file=sys.stderr)
+
+ search = search.params(clear_scroll=False)
+ search = search.params(_source=False)
+
+ results = search.scan()
+ for hit in results:
+ release_id = hit.meta.id
+ resp = api_session.get(
+ 'https://api.fatcat.wiki/v0/release/{}'.format(release_id),
+ params={
+ 'expand': 'container,files,filesets,webcaptures',
+ 'hide': 'references',
+ })
+ resp.raise_for_status()
+ row = dict(
+ fatcat_hit=hit.meta._d_,
+ release_id=release_id,
+ fatcat_release=resp.json(),
+ )
+ print(json.dumps(row, sort_keys=True), file=json_output)
+
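A similarly hedged sketch for driving query_fatcat() directly; it assumes the default ELASTICSEARCH_BACKEND of https://search.fatcat.wiki is reachable and that api.fatcat.wiki is up, since the function both scrolls the search index and fetches each full release entity:

    # Hypothetical driver for query_fatcat(); writes one enriched JSON line
    # per matching release to the given file object.
    import sys

    from fatcat_covid19.query import query_fatcat

    if __name__ == "__main__":
        query_fatcat(sys.stdout)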