diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/references.py | 85 | ||||
| -rw-r--r-- | python/fatcat_web/ref_routes.py | 16 | ||||
| -rw-r--r-- | python/fatcat_web/templates/entity_base.html | 5 | ||||
| -rw-r--r-- | python/fatcat_web/templates/release_view_fuzzy_refs.html | 12 | 
4 files changed, 52 insertions, 66 deletions
| diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 1d8a0d0d..a0079efd 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate  index of reference links between works in the main catalog.  See bulk citation and citation API proposals for design documentation. - -TODO: - -    surt_ify() helper (URL to SURT for queries) -    CSL enrichment method (using only elasticsearch mget) -    CSL enrichment for fatcat enrichment -    access transform -    microfilm access in access transform - -    all_outbound_refs(...) -> List[BiblioRef] -    all_inbound_refs(...) -> List[BiblioRef] -        same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) -        (optional; maybe not public)  """  import sys  import json  import datetime  import argparse -from typing import Optional, List, Any, Dict +from typing import Optional, List, Any, Dict, Union  from pydantic import BaseModel  import elasticsearch @@ -45,8 +32,6 @@ class BiblioRef(BaseModel):      source_work_ident: Optional[str]      # with lang prefix like "en:Superglue"      source_wikipedia_article: Optional[str] -    # skipped: source_openlibrary_work -    # skipped: source_url_surt      source_release_stage: Optional[str]      source_year: Optional[int] @@ -65,7 +50,6 @@ class BiblioRef(BaseModel):      target_url_surt: Optional[str]      # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform      target_url: Optional[str] -    # skipped: target_wikipedia_article      # crossref, pubmed, grobid, etc      match_provenance: Optional[str] @@ -92,31 +76,20 @@ class BiblioRef(BaseModel):          # TODO: if target_openlibrary_work, add an access option?          return self -class CslBiblioRef(BaseModel): -    # an "enriched" version of BiblioRef with metadata about the source or -    # target entity. would be "hydrated" via a lookup to, eg, the -    # `fatcat_release` elasticsearch index (fast mget fetch with a single -    # request), as opposed to fatcat API fetches -    ref: BiblioRef -    csl: Optional[Dict[str, Any]] -    access: List[AccessOption] - -    class Config: -        arbitrary_types_allowed = True -class FatcatBiblioRef(BaseModel): +class EnrichedBiblioRef(BaseModel):      # enriched version of BiblioRef with complete ReleaseEntity object as      # fetched from the fatcat API. CSL-JSON metadata would be derived from      # the full release entity.      ref: BiblioRef      release: Optional[ReleaseEntity]      # TODO: openlibrary work? -    #csl: Optional[Dict[str, Any]]      access: List[AccessOption]      class Config:          arbitrary_types_allowed = True +  class RefHits(BaseModel):      count_returned: int      count_total: int @@ -124,9 +97,13 @@ class RefHits(BaseModel):      limit: int      query_time_ms: int      query_wall_time_ms: int -    result_refs: List[BiblioRef] +    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] -def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]: + +def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits: +    """ +    Internal helper for querying elasticsearch refs index and transforming hits +    """      limit = min((int(limit or 15), 200))      if not offset or offset < 0: @@ -179,7 +156,7 @@ def get_outbound_refs(      limit: int = 100,      offset: Optional[int] = None,      es_index: str = "fatcat_ref", -) -> List[BiblioRef]: +) -> RefHits:      search = Search(using=es_client, index=es_index) @@ -199,6 +176,7 @@ def get_outbound_refs(      hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)      return hits +  def get_inbound_refs(      es_client: Any,      release_ident: Optional[str] = None, @@ -208,12 +186,11 @@ def get_inbound_refs(      url: Optional[str] = None,      consolidate_works: bool = True,      filter_stage: List[str] = [], -    filter_type: List[str] = [], +    sort: Optional[str] = None,      limit: int = 25,      offset: Optional[int] = None,      es_index: str = "fatcat_ref",  ) -> List[BiblioRef]: -    # TODO: filter_stage, filter_type      if url and not url_surt:          url = surt_ify(url) @@ -239,10 +216,19 @@ def get_inbound_refs(      else:          raise ValueError("require a lookup key") -    search = search.sort("-source_year") +    if filter_stage: +        search = search.filter("term", source_stage=filter_stage) + +    if sort == "newest": +        search = search.sort("-source_year") +    elif sort == "oldest": +        search = search.sort("source_year") +    else: +        search = search.sort("-source_year")      return _execute_ref_query(search, limit=limit, offset=offset) +  def count_inbound_refs(      es_client: Any,      release_ident: Optional[str] = None, @@ -251,7 +237,6 @@ def count_inbound_refs(      url_surt: Optional[str] = None,      url: Optional[str] = None,      filter_stage: List[str] = [], -    filter_type: List[str] = [],      es_index: str = "fatcat_ref",  ) -> int:      """ @@ -274,28 +259,26 @@ def count_inbound_refs(      else:          raise ValueError("require a lookup key") +    if filter_stage: +        search = search.filter("term", source_stage=filter_stage) +      return search.count() -# run elasticsearch mget query for all ref idents and include "enriched" refs when possible -# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL -# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? -#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] -#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]  # run fatcat API fetches for each ref and return "enriched" refs -def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: +def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:      enriched = []      for ref in refs:          if ref.source_release_ident:              release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref,                  #csl=None,                  access=release_access_options(release),                  release=release,              ))          else: -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref,                  #csl=None,                  access=[], @@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi              ))      return enriched -def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + +def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:      enriched = []      for ref in refs:          if ref.target_release_ident:              release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref, -                #csl=None,                  access=release_access_options(release),                  release=release,              ))          else: -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref, -                #csl=None,                  access=[],                  release=None,              )) @@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h  def run_ref_query(args) -> None: +    """ +    CLI helper/debug tool (prints to stdout) +    """      release_ident = None      work_ident = None      if args.ident.startswith("release_"): diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index e08aaf15..e24b4ac6 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -11,14 +11,14 @@ from fatcat_openapi_client.rest import ApiException  from fuzzycat.grobid_unstructured import grobid_api_process_citation, transform_grobid_ref_xml, grobid_ref_to_release  from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matches -from fatcat_tools.references import enrich_inbound_refs_fatcat, enrich_outbound_refs_fatcat, get_inbound_refs, get_outbound_refs +from fatcat_tools.references import enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, get_outbound_refs  from fatcat_tools.transforms.access import release_access_options  from fatcat_web import app, api, auth_api  from fatcat_web.forms import *  from fatcat_web.entity_helpers import * -@app.route('/release/<string(length=26):ident>/inbound-refs', methods=['GET']) +@app.route('/release/<string(length=26):ident>/refs/in', methods=['GET'])  def release_view_refs_inbound(ident):      release = generic_get_entity("release", ident) @@ -27,11 +27,12 @@ def release_view_refs_inbound(ident):      offset = max(0, int(offset)) if offset.isnumeric() else 0      hits = get_inbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) -    enriched_refs = enrich_inbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") +    enriched_refs = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") -    return render_template('release_view_fuzzy_refs.html', direction="inbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200 +    return render_template('release_view_fuzzy_refs.html', direction="in", entity=release, hits=hits, enriched_refs=enriched_refs), 200 -@app.route('/release/<string(length=26):ident>/outbound-refs', methods=['GET']) + +@app.route('/release/<string(length=26):ident>/refs/out', methods=['GET'])  def release_view_refs_outbound(ident):      release = generic_get_entity("release", ident) @@ -40,9 +41,10 @@ def release_view_refs_outbound(ident):      offset = max(0, int(offset)) if offset.isnumeric() else 0      hits = get_outbound_refs(release_ident=ident, es_client=app.es_client, offset=offset, limit=30) -    enriched_refs = enrich_outbound_refs_fatcat(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") +    enriched_refs = enrich_outbound_refs(hits.result_refs, fatcat_api_client=api, expand="container,files,webcaptures") + +    return render_template('release_view_fuzzy_refs.html', direction="out", entity=release, hits=hits, enriched_refs=enriched_refs), 200 -    return render_template('release_view_fuzzy_refs.html', direction="outbound", entity=release, hits=hits, enriched_refs=enriched_refs), 200  @app.route('/reference/match', methods=['GET', 'POST'])  def reference_match(): diff --git a/python/fatcat_web/templates/entity_base.html b/python/fatcat_web/templates/entity_base.html index c23dbef2..78a151a0 100644 --- a/python/fatcat_web/templates/entity_base.html +++ b/python/fatcat_web/templates/entity_base.html @@ -85,10 +85,9 @@                {{ entity_tab("coverage", "Coverage", "/coverage") }}              {% elif entity_type == "release" and entity.state != 'deleted' %}                {{ entity_tab("contribs", "Authors", "/contribs", entity._authors|count ) }} -              {{ entity_tab("references", "References", "/references", entity.refs|count) }}                {% if  entity.state == 'active' %} -                {{ entity_tab("inbound-refs", "Inbound", "/inbound-refs") }} -                {{ entity_tab("outbound-refs", "Outbound", "/outbound-refs") }} +                {{ entity_tab("refs-out", "References", "/refs/out") }} +                {{ entity_tab("refs-in", "Cited By", "/refs/in") }}                {% endif %}              {% endif %}              {{ entity_tab("metadata", "Metadata", "/metadata") }} diff --git a/python/fatcat_web/templates/release_view_fuzzy_refs.html b/python/fatcat_web/templates/release_view_fuzzy_refs.html index 7b286fd3..43860a31 100644 --- a/python/fatcat_web/templates/release_view_fuzzy_refs.html +++ b/python/fatcat_web/templates/release_view_fuzzy_refs.html @@ -1,5 +1,5 @@  {% set release = entity %} -{% set entity_view = "{{ direction }}-refs" %} +{% set entity_view = "refs-" + direction %}  {% set entity_type = "release" %}  {% import "entity_macros.html" as entity_macros %}  {% extends "entity_base.html" %} @@ -17,10 +17,10 @@  {% block entity_main %} -{% if direction == "inbound" %} -  <h3>Referenced By</h3> -  <i>Citations to this release by other works.</i> -{% elif direction == "outbound" %} +{% if direction == "in" %} +  <h3>Cited By</h3> +  <i>References to this release by other works.</i> +{% elif direction == "out" %}    <h3>References</h3>    <i>NOTE: currently batch computed and may include additional references sources, or be missing recent changes, compared to entity reference list.</i>  {% endif %} @@ -36,7 +36,7 @@    {% set release = row.release %}    <tr><td class="collapsing left aligned top aligned">          {# TODO: ref_locator? #} -        {% if direction == "outbound" %} +        {% if direction == "out" %}            {% if row.ref.ref_key %}              <code>[{{ row.ref.ref_key }}]</code><br>            {% endif %} | 
