From 7489ef7a979574effa74f1f17cebb81eefb1b71a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 23 Jul 2021 11:56:42 -0700 Subject: refs: refactor web paths; enrich refs as generic; remove old refs link --- python/fatcat_tools/references.py | 85 +++++++++------------- python/fatcat_web/ref_routes.py | 16 ++-- python/fatcat_web/templates/entity_base.html | 5 +- .../templates/release_view_fuzzy_refs.html | 12 +-- 4 files changed, 52 insertions(+), 66 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 1d8a0d0d..a0079efd 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate index of reference links between works in the main catalog. See bulk citation and citation API proposals for design documentation. - -TODO: - - surt_ify() helper (URL to SURT for queries) - CSL enrichment method (using only elasticsearch mget) - CSL enrichment for fatcat enrichment - access transform - microfilm access in access transform - - all_outbound_refs(...) -> List[BiblioRef] - all_inbound_refs(...) -> List[BiblioRef] - same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) - (optional; maybe not public) """ import sys import json import datetime import argparse -from typing import Optional, List, Any, Dict +from typing import Optional, List, Any, Dict, Union from pydantic import BaseModel import elasticsearch @@ -45,8 +32,6 @@ class BiblioRef(BaseModel): source_work_ident: Optional[str] # with lang prefix like "en:Superglue" source_wikipedia_article: Optional[str] - # skipped: source_openlibrary_work - # skipped: source_url_surt source_release_stage: Optional[str] source_year: Optional[int] @@ -65,7 +50,6 @@ class BiblioRef(BaseModel): target_url_surt: Optional[str] # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform target_url: Optional[str] - # skipped: target_wikipedia_article # crossref, pubmed, grobid, etc match_provenance: Optional[str] @@ -92,31 +76,20 @@ class BiblioRef(BaseModel): # TODO: if target_openlibrary_work, add an access option? return self -class CslBiblioRef(BaseModel): - # an "enriched" version of BiblioRef with metadata about the source or - # target entity. would be "hydrated" via a lookup to, eg, the - # `fatcat_release` elasticsearch index (fast mget fetch with a single - # request), as opposed to fatcat API fetches - ref: BiblioRef - csl: Optional[Dict[str, Any]] - access: List[AccessOption] - - class Config: - arbitrary_types_allowed = True -class FatcatBiblioRef(BaseModel): +class EnrichedBiblioRef(BaseModel): # enriched version of BiblioRef with complete ReleaseEntity object as # fetched from the fatcat API. CSL-JSON metadata would be derived from # the full release entity. ref: BiblioRef release: Optional[ReleaseEntity] # TODO: openlibrary work? - #csl: Optional[Dict[str, Any]] access: List[AccessOption] class Config: arbitrary_types_allowed = True + class RefHits(BaseModel): count_returned: int count_total: int @@ -124,9 +97,13 @@ class RefHits(BaseModel): limit: int query_time_ms: int query_wall_time_ms: int - result_refs: List[BiblioRef] + result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] -def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]: + +def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits: + """ + Internal helper for querying elasticsearch refs index and transforming hits + """ limit = min((int(limit or 15), 200)) if not offset or offset < 0: @@ -179,7 +156,7 @@ def get_outbound_refs( limit: int = 100, offset: Optional[int] = None, es_index: str = "fatcat_ref", -) -> List[BiblioRef]: +) -> RefHits: search = Search(using=es_client, index=es_index) @@ -199,6 +176,7 @@ def get_outbound_refs( hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0) return hits + def get_inbound_refs( es_client: Any, release_ident: Optional[str] = None, @@ -208,12 +186,11 @@ def get_inbound_refs( url: Optional[str] = None, consolidate_works: bool = True, filter_stage: List[str] = [], - filter_type: List[str] = [], + sort: Optional[str] = None, limit: int = 25, offset: Optional[int] = None, es_index: str = "fatcat_ref", ) -> List[BiblioRef]: - # TODO: filter_stage, filter_type if url and not url_surt: url = surt_ify(url) @@ -239,10 +216,19 @@ def get_inbound_refs( else: raise ValueError("require a lookup key") - search = search.sort("-source_year") + if filter_stage: + search = search.filter("term", source_stage=filter_stage) + + if sort == "newest": + search = search.sort("-source_year") + elif sort == "oldest": + search = search.sort("source_year") + else: + search = search.sort("-source_year") return _execute_ref_query(search, limit=limit, offset=offset) + def count_inbound_refs( es_client: Any, release_ident: Optional[str] = None, @@ -251,7 +237,6 @@ def count_inbound_refs( url_surt: Optional[str] = None, url: Optional[str] = None, filter_stage: List[str] = [], - filter_type: List[str] = [], es_index: str = "fatcat_ref", ) -> int: """ @@ -274,28 +259,26 @@ def count_inbound_refs( else: raise ValueError("require a lookup key") + if filter_stage: + search = search.filter("term", source_stage=filter_stage) + return search.count() -# run elasticsearch mget query for all ref idents and include "enriched" refs when possible -# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL -# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? -#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] -#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] # run fatcat API fetches for each ref and return "enriched" refs -def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: +def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: if ref.source_release_ident: release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, #csl=None, access=release_access_options(release), release=release, )) else: - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, #csl=None, access=[], @@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi )) return enriched -def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + +def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]: enriched = [] for ref in refs: if ref.target_release_ident: release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, - #csl=None, access=release_access_options(release), release=release, )) else: - enriched.append(FatcatBiblioRef( + enriched.append(EnrichedBiblioRef( ref=ref, - #csl=None, access=[], release=None, )) @@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h def run_ref_query(args) -> None: + """ + CLI helper/debug tool (prints to stdout) + """ release_ident = None work_ident = None if args.ident.startswith("release_"): diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index e08aaf15..e24b4ac6 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -11,14 +11,14 @@ from fatcat_openapi_client.rest import ApiException from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches -from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs +from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs from fatcat_tools.transforms.access import release_access_options from fatcat_web import app, api, auth_api from fatcat_web.forms import * from fatcat_web.entity_helpers import * -@app.route('/release//inbound-refs', methods=['GET']) +@app.route('/release//refs/in', methods=['GET']) def release_view_refs_inbound(ident): release = generic_get_entity("release", ident) @@ -27,11 +27,12 @@ def release_view_refs_inbound(ident): offset = max(0, int(offset)) if offset.isnumeric() else 0 hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) - enriched_refs = enrich_inbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + enriched_refs = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") - return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 + return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits, enriched_refs=enriched_refs), 200 -@app.route('/release//outbound-refs', methods=['GET']) + +@app.route('/release//refs/out', methods=['GET']) def release_view_refs_outbound(ident): release = generic_get_entity("release", ident) @@ -40,9 +41,10 @@ def release_view_refs_outbound(ident): offset = max(0, int(offset)) if offset.isnumeric() else 0 hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) - enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + enriched_refs = enrich_outbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + + return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits, enriched_refs=enriched_refs), 200 - return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 @app.route('/reference/match', methods=['GET', 'POST']) def reference_match(): diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html index c23dbef2..78a151a0 100644 --- a/python/fatcat_web/templates/entity_base.html +++ b/python/fatcat_web/templates/entity_base.html @@ -85,10 +85,9 @@ {{ entity_tab("coverage", "Coverage", "/coverage") }} {% elif entity_type == "release" and entity.state != 'deleted' %} {{ entity_tab("contribs", "Authors", "/contribs", entity._authors|count ) }} - {{ entity_tab("references", "References", "/references", entity.refs|count) }} {% if entity.state == 'active' %} - {{ entity_tab("inbound-refs", "Inbound", "/inbound-refs") }} - {{ entity_tab("outbound-refs", "Outbound", "/outbound-refs") }} + {{ entity_tab("refs-out", "References", "/refs/out") }} + {{ entity_tab("refs-in", "Cited By", "/refs/in") }} {% endif %} {% endif %} {{ entity_tab("metadata", "Metadata", "/metadata") }} diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index 7b286fd3..43860a31 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -1,5 +1,5 @@ {% set release = entity %} -{% set entity_view = "{{ direction }}-refs" %} +{% set entity_view = "refs-" + direction %} {% set entity_type = "release" %} {% import "entity_macros.html" as entity_macros %} {% extends "entity_base.html" %} @@ -17,10 +17,10 @@ {% block entity_main %} -{% if direction == "inbound" %} -

Referenced By

- Citations to this release by other works. -{% elif direction == "outbound" %} +{% if direction == "in" %} +

Cited By

+ References to this release by other works. +{% elif direction == "out" %}

References

NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list. {% endif %} @@ -36,7 +36,7 @@ {% set release = row.release %} {# TODO: ref_locator? #} - {% if direction == "outbound" %} + {% if direction == "out" %} {% if row.ref.ref_key %} [{{ row.ref.ref_key }}]
{% endif %} -- cgit v1.2.3