From 198db52d3a93a2b7d7cab0a4140c6402a14eca84 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 4 Jun 2020 13:18:35 -0700 Subject: collapse pages by SIM issue --- fatcat_scholar/schema.py | 1 + fatcat_scholar/search.py | 28 ++++++++++++++++++--- fatcat_scholar/templates/search.html | 2 -- fatcat_scholar/templates/search_macros.html | 39 ++++++++++++++++++----------- fatcat_scholar/transform.py | 3 +++ proposals/work_schema.md | 3 ++- schema/scholar_fulltext.v01.json | 1 + 7 files changed, 57 insertions(+), 20 deletions(-) diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index c5f2927..1d2e7a3 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -165,6 +165,7 @@ class ScholarDoc(BaseModel): key: str doc_type: str # enum: work or page doc_index_ts: datetime.datetime + collapse_key: str work_ident: Optional[str] tags: List[str] = [] diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index bfc7c6e..ce06fb7 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -28,6 +28,7 @@ class FulltextQuery(BaseModel): filter_type: Optional[str] = None filter_availability: Optional[str] = None sort_order: Optional[str] = None + collapse_pages: bool = True time_options: Any = { "label": gettext("Release Date"), "slug": "filter_time", @@ -196,6 +197,13 @@ def do_fulltext_search( number_of_fragments=2, fragment_size=300, ) + if query.collapse_pages: + search = search.extra( + collapse={ + "field": "collapse_key", + "inner_hits": {"name": "more_pages", "size": 0,}, + } + ) # sort order if query.sort_order == "time_asc": @@ -234,12 +242,19 @@ def do_fulltext_search( results = [] for h in resp: r = h._d_ - # print(json.dumps(h.meta._d_, indent=2)) + # print(h.meta._d_) r["_highlights"] = [] if "highlight" in dir(h.meta): highlights = h.meta.highlight._d_ for k in highlights: r["_highlights"] += highlights[k] + r["_collapsed"] = [] + r["_collapsed_count"] = 0 + if "inner_hits" in dir(h.meta): + r["_collapsed_count"] = h.meta.inner_hits.more_pages.hits.total - 1 + for k in h.meta.inner_hits.more_pages: + if k["key"] != r["key"]: + r["_collapsed"].append(k) results.append(r) for h in results: @@ -250,9 +265,16 @@ def do_fulltext_search( if type(h[key]) is str: h[key] = h[key].encode("utf8", "ignore").decode("utf8") + count_found: int = int(resp.hits.total) + count_returned = len(results) + + # if we grouped to less than a page of hits, update returned count + if query.collapse_pages and offset == 0 and (count_returned < limit): + count_found = count_returned + return FulltextHits( - count_returned=len(results), - count_found=int(resp.hits.total), + count_returned=count_returned, + count_found=count_found, offset=offset, limit=limit, deep_page_limit=deep_page_limit, diff --git a/fatcat_scholar/templates/search.html b/fatcat_scholar/templates/search.html index 1f7a9b9..13d1aec 100644 --- a/fatcat_scholar/templates/search.html +++ b/fatcat_scholar/templates/search.html @@ -21,7 +21,6 @@ {% if hits %} - {#

{{ "{:,}".format(hits.count_found) }}

#} {{ "{:,}".format(hits.count_found) }} Hits in {{ "{:0.2}".format(hits.query_time_ms/1000.0) }}sec @@ -35,7 +34,6 @@
{% if hits %}
- {#

{{ "{:,}".format(hits.count_found) }}

#}

{{ "{:,}".format(hits.count_found) }}

Hits
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index 07a4510..abdb6d3 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -67,7 +67,7 @@ {{ paper.biblio.release_year }} {% endif %} {% if paper.biblio.release_year and paper.biblio.container_name %} - + {% endif %} {% if paper.biblio.container_name %} @@ -130,22 +130,33 @@ {# ### TAGS #}
- {# colors to use: olive, brown, grey, pink, red, etc #} - {# TODO: remove doc for ES 7.x-style lack of type #} - {# TODO: only show 'json' link if from cluster? #} - - json - - {% if paper.biblio.release_ident %} - - metadata + {# colors to use: olive, brown, grey, pink, red, etc #} + {# TODO: remove doc for ES 7.x-style lack of type #} + {# TODO: only show 'json' link if from cluster? #} + + json - {% endif %} - {% for tag in paper.tags %} - {{ _(tag) }} - {% endfor %} + {% if paper.biblio.release_ident %} + + metadata + + {% endif %} + + {% for tag in paper.tags %} + {{ _(tag) }} + {% endfor %}
+ {# ### COLLAPSED HITS #} + {% if paper._collapsed_count > 0 %} +
+ +
+ {% endif %} +
{% if paper.fulltext.access_url %} diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index 3d47fb4..847cc6e 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -187,6 +187,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: tags: List[str] = [] work_ident: Optional[str] = None + sim_issue: Optional[str] = None abstracts: List[ScholarAbstract] = [] fulltext: Optional[ScholarFulltext] = None primary_release: Optional[ReleaseEntity] = None @@ -199,6 +200,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if heavy.doc_type == DocType.sim_page: assert ia_sim is not None key = f"page_{ia_sim.issue_item}_{ia_sim.first_page}" + sim_issue = ia_sim.issue_item biblio = es_biblio_from_sim(heavy.sim_fulltext) fulltext = es_fulltext_from_sim(heavy.sim_fulltext) elif heavy.doc_type == DocType.work: @@ -316,6 +318,7 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: return ScholarDoc( key=key, + collapse_key=sim_issue or work_ident, doc_type=heavy.doc_type.value, doc_index_ts=datetime.datetime.utcnow(), work_ident=work_ident, diff --git a/proposals/work_schema.md b/proposals/work_schema.md index 933e750..97d60ac 100644 --- a/proposals/work_schema.md +++ b/proposals/work_schema.md @@ -3,9 +3,10 @@ - type: `_doc` (aka, no type, `include_type_name=false`) - key: keyword (same as `_id`) +- `collapse_key`: work ident, or SIM issue item (for collapsing/grouping search hits) - `doc_type`: keyword (work or page) - `doc_index_ts`: timestamp when document indexed -- `work_id`: fatcat work ident (optional) +- `work_ident`: fatcat work ident (optional) - `biblio`: obj - `fulltext`: obj diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json index 7653633..1929512 100644 --- a/schema/scholar_fulltext.v01.json +++ b/schema/scholar_fulltext.v01.json @@ -63,6 +63,7 @@ "properties": { "key": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "collapse_key": { "type": "keyword", "normalizer": "default" }, "doc_type": { "type": "keyword", "normalizer": "default" }, "doc_index_ts": { "type": "date" }, "work_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, -- cgit v1.2.3