indexing tweaks

author: Bryan Newbold <bnewbold@archive.org> 2020-05-20 17:04:26 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-20 17:04:26 -0700
commit: a3875b4715f957be86049b4d90d43b0ac5af1b75 (patch)
tree: 6d9630aae2734e978c6229ed38dd55de32c83fc7 /fatcat_scholar
parent: 7597c6192dfe568d72fa26c9666b2a9a40b37a6f (diff)
download: fatcat-scholar-a3875b4715f957be86049b4d90d43b0ac5af1b75.tar.gz fatcat-scholar-a3875b4715f957be86049b4d90d43b0ac5af1b75.zip
2 files changed, 11 insertions, 16 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 2373245..08eadae 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -79,11 +79,11 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
 
     # type filters
     if filter_type == "papers":
-        search = search.filter("terms", release_type=[ "article-journal", "paper-conference", "chapter", ])
+        search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ])
     elif filter_type == "reports":
-        search = search.filter("terms", release_type=[ "report", "standard", ])
+        search = search.filter("terms", type=[ "report", "standard", ])
     elif filter_type == "datasets":
-        search = search.filter("terms", release_type=[ "dataset", "software", ])
+        search = search.filter("terms", type=[ "dataset", "software", ])
     elif filter_type == "everything" or filter_type == None:
         pass
     else:
@@ -93,13 +93,13 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
     # time filters
     if filter_time == "past_week":
         week_ago_date = str(datetime.date.today() - datetime.timedelta(days=7))
-        search = search.filter("range", release_date=dict(gte=week_ago_date))
+        search = search.filter("range", date=dict(gte=week_ago_date))
     elif filter_time == "this_year":
-        search = search.filter("term", release_year=datetime.date.today().year)
+        search = search.filter("term", year=datetime.date.today().year)
     elif filter_time == "since_2000":
-        search = search.filter("range", release_year=dict(gte=2000))
+        search = search.filter("range", year=dict(gte=2000))
     elif filter_time == "before_1925":
-        search = search.filter("range", release_year=dict(lte=1924))
+        search = search.filter("range", year=dict(lt=1925))
     elif filter_time == "all_time" or filter_time == None:
         pass
     else:
@@ -129,11 +129,5 @@ def do_fulltext_search(q, limit=25, offset=0, filter_time=None, filter_type=None
 
     resp = generic_search_execute(search, offset=offset)
 
-    for h in resp['results']:
-        # Ensure 'contrib_names' is a list, not a single string
-        if type(h['contrib_names']) is not list:
-            h['contrib_names'] = [h['contrib_names'], ]
-        h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']]
-
     resp["query"] = { "q": q }
     return resp
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 54d3f71..a86fe15 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -202,7 +202,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         primary_release = [r for r in heavy.releases if r.ident == heavy.biblio_release_ident][0]
         biblio = es_biblio_from_release(primary_release)
 
-        # TODO: abstracts from releases also? abstracts_dict?
+        # TODO: abstracts from releases also; abstracts_dict; abstracts from GROBID parse
         abstracts = es_abstracts_from_release(primary_release)
     else:
         raise NotImplementedError(f"doc_type: {heavy.doc_type}")
@@ -215,8 +215,9 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
         fulltext = es_fulltext_from_grobid(heavy.grobid_fulltext['tei_xml'], fulltext_release, fulltext_file)
 
         # hack to pull through thumbnail from local pdftotext
-        if fulltext and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
-            fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/sha1/{fulltext_file.sha1}" # XXX
+        if fulltext and fulltext.file_sha1 and not fulltext.thumbnail_url and heavy.pdftotext_fulltext:
+            # https://covid19.fatcat.wiki/fulltext_web/thumbnail/c9/c9e87f843b3cf7dc47881fa3d3ccb4693d7d9521.png
+            fulltext.thumbnail_url = f"https://covid19.fatcat.wiki/fulltext_web/thumbnail/{fulltext.file_sha1[:2]}/{fulltext.file_sha1}.png"
 
     if not fulltext and heavy.pdftotext_fulltext:
author	Bryan Newbold <bnewbold@archive.org>	2020-05-20 17:04:26 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-05-20 17:04:26 -0700
commit	a3875b4715f957be86049b4d90d43b0ac5af1b75 (patch)
tree	6d9630aae2734e978c6229ed38dd55de32c83fc7 /fatcat_scholar
parent	7597c6192dfe568d72fa26c9666b2a9a40b37a6f (diff)
download	fatcat-scholar-a3875b4715f957be86049b4d90d43b0ac5af1b75.tar.gz fatcat-scholar-a3875b4715f957be86049b4d90d43b0ac5af1b75.zip