diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-07-23 11:56:42 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-07-23 11:56:42 -0700 | 
| commit | 7489ef7a979574effa74f1f17cebb81eefb1b71a (patch) | |
| tree | 252bc76358fb769aa52305d45e449c547a740f33 /python/fatcat_tools | |
| parent | 0d17bad63b2d92220b8ddaeb9b5733b2b09f57a0 (diff) | |
| download | fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.tar.gz fatcat-7489ef7a979574effa74f1f17cebb81eefb1b71a.zip | |
refs: refactor web paths; enrich refs as generic; remove old refs link
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/references.py | 85 | 
1 files changed, 35 insertions, 50 deletions
| diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 1d8a0d0d..a0079efd 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a separate  index of reference links between works in the main catalog.  See bulk citation and citation API proposals for design documentation. - -TODO: - -    surt_ify() helper (URL to SURT for queries) -    CSL enrichment method (using only elasticsearch mget) -    CSL enrichment for fatcat enrichment -    access transform -    microfilm access in access transform - -    all_outbound_refs(...) -> List[BiblioRef] -    all_inbound_refs(...) -> List[BiblioRef] -        same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?) -        (optional; maybe not public)  """  import sys  import json  import datetime  import argparse -from typing import Optional, List, Any, Dict +from typing import Optional, List, Any, Dict, Union  from pydantic import BaseModel  import elasticsearch @@ -45,8 +32,6 @@ class BiblioRef(BaseModel):      source_work_ident: Optional[str]      # with lang prefix like "en:Superglue"      source_wikipedia_article: Optional[str] -    # skipped: source_openlibrary_work -    # skipped: source_url_surt      source_release_stage: Optional[str]      source_year: Optional[int] @@ -65,7 +50,6 @@ class BiblioRef(BaseModel):      target_url_surt: Optional[str]      # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform      target_url: Optional[str] -    # skipped: target_wikipedia_article      # crossref, pubmed, grobid, etc      match_provenance: Optional[str] @@ -92,31 +76,20 @@ class BiblioRef(BaseModel):          # TODO: if target_openlibrary_work, add an access option?          return self -class CslBiblioRef(BaseModel): -    # an "enriched" version of BiblioRef with metadata about the source or -    # target entity. would be "hydrated" via a lookup to, eg, the -    # `fatcat_release` elasticsearch index (fast mget fetch with a single -    # request), as opposed to fatcat API fetches -    ref: BiblioRef -    csl: Optional[Dict[str, Any]] -    access: List[AccessOption] - -    class Config: -        arbitrary_types_allowed = True -class FatcatBiblioRef(BaseModel): +class EnrichedBiblioRef(BaseModel):      # enriched version of BiblioRef with complete ReleaseEntity object as      # fetched from the fatcat API. CSL-JSON metadata would be derived from      # the full release entity.      ref: BiblioRef      release: Optional[ReleaseEntity]      # TODO: openlibrary work? -    #csl: Optional[Dict[str, Any]]      access: List[AccessOption]      class Config:          arbitrary_types_allowed = True +  class RefHits(BaseModel):      count_returned: int      count_total: int @@ -124,9 +97,13 @@ class RefHits(BaseModel):      limit: int      query_time_ms: int      query_wall_time_ms: int -    result_refs: List[BiblioRef] +    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]] -def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]: + +def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits: +    """ +    Internal helper for querying elasticsearch refs index and transforming hits +    """      limit = min((int(limit or 15), 200))      if not offset or offset < 0: @@ -179,7 +156,7 @@ def get_outbound_refs(      limit: int = 100,      offset: Optional[int] = None,      es_index: str = "fatcat_ref", -) -> List[BiblioRef]: +) -> RefHits:      search = Search(using=es_client, index=es_index) @@ -199,6 +176,7 @@ def get_outbound_refs(      hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)      return hits +  def get_inbound_refs(      es_client: Any,      release_ident: Optional[str] = None, @@ -208,12 +186,11 @@ def get_inbound_refs(      url: Optional[str] = None,      consolidate_works: bool = True,      filter_stage: List[str] = [], -    filter_type: List[str] = [], +    sort: Optional[str] = None,      limit: int = 25,      offset: Optional[int] = None,      es_index: str = "fatcat_ref",  ) -> List[BiblioRef]: -    # TODO: filter_stage, filter_type      if url and not url_surt:          url = surt_ify(url) @@ -239,10 +216,19 @@ def get_inbound_refs(      else:          raise ValueError("require a lookup key") -    search = search.sort("-source_year") +    if filter_stage: +        search = search.filter("term", source_stage=filter_stage) + +    if sort == "newest": +        search = search.sort("-source_year") +    elif sort == "oldest": +        search = search.sort("source_year") +    else: +        search = search.sort("-source_year")      return _execute_ref_query(search, limit=limit, offset=offset) +  def count_inbound_refs(      es_client: Any,      release_ident: Optional[str] = None, @@ -251,7 +237,6 @@ def count_inbound_refs(      url_surt: Optional[str] = None,      url: Optional[str] = None,      filter_stage: List[str] = [], -    filter_type: List[str] = [],      es_index: str = "fatcat_ref",  ) -> int:      """ @@ -274,28 +259,26 @@ def count_inbound_refs(      else:          raise ValueError("require a lookup key") +    if filter_stage: +        search = search.filter("term", source_stage=filter_stage) +      return search.count() -# run elasticsearch mget query for all ref idents and include "enriched" refs when possible -# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL -# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index? -#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef] -#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]  # run fatcat API fetches for each ref and return "enriched" refs -def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: +def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:      enriched = []      for ref in refs:          if ref.source_release_ident:              release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand) -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref,                  #csl=None,                  access=release_access_options(release),                  release=release,              ))          else: -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref,                  #csl=None,                  access=[], @@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi              ))      return enriched -def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]: + +def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:      enriched = []      for ref in refs:          if ref.target_release_ident:              release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand) -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref, -                #csl=None,                  access=release_access_options(release),                  release=release,              ))          else: -            enriched.append(FatcatBiblioRef( +            enriched.append(EnrichedBiblioRef(                  ref=ref, -                #csl=None,                  access=[],                  release=None,              )) @@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h  def run_ref_query(args) -> None: +    """ +    CLI helper/debug tool (prints to stdout) +    """      release_ident = None      work_ident = None      if args.ident.startswith("release_"): | 
