From 7489ef7a979574effa74f1f17cebb81eefb1b71a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 23 Jul 2021 11:56:42 -0700
Subject: refs: refactor web paths; enrich refs as generic; remove old refs link

---
 python/fatcat_tools/references.py | 85 ++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 50 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 1d8a0d0d..a0079efd 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -3,26 +3,13 @@ Helper routines for working with the fatcat citation graph, which is a
 separate index of reference links between works in the main catalog. See bulk
 citation and citation API proposals for design documentation.
-
-TODO:
-
-    surt_ify() helper (URL to SURT for queries)
-    CSL enrichment method (using only elasticsearch mget)
-    CSL enrichment for fatcat enrichment
-    access transform
-    microfilm access in access transform
-
-    all_outbound_refs(...) -> List[BiblioRef]
-    all_inbound_refs(...) -> List[BiblioRef]
-        same as get_outbound_refs()/get_inbound_refs(), but does a scroll (return list or iterator?)
-        (optional; maybe not public)
 """
 
 import sys
 import json
 import datetime
 import argparse
-from typing import Optional, List, Any, Dict
+from typing import Optional, List, Any, Dict, Union
 
 from pydantic import BaseModel
 import elasticsearch
@@ -45,8 +32,6 @@ class BiblioRef(BaseModel):
     source_work_ident: Optional[str]
     # with lang prefix like "en:Superglue"
     source_wikipedia_article: Optional[str]
-    # skipped: source_openlibrary_work
-    # skipped: source_url_surt
 
     source_release_stage: Optional[str]
     source_year: Optional[int]
@@ -65,7 +50,6 @@ class BiblioRef(BaseModel):
     target_url_surt: Optional[str]
     # would not be stored in elasticsearch, but would be auto-generated by all "get" methods from the SURT, so calling code does not need to do SURT transform
     target_url: Optional[str]
-    # skipped: target_wikipedia_article
 
     # crossref, pubmed, grobid, etc
     match_provenance: Optional[str]
@@ -92,31 +76,20 @@ class BiblioRef(BaseModel):
         # TODO: if target_openlibrary_work, add an access option?
         return self
 
-class CslBiblioRef(BaseModel):
-    # an "enriched" version of BiblioRef with metadata about the source or
-    # target entity. would be "hydrated" via a lookup to, eg, the
-    # `fatcat_release` elasticsearch index (fast mget fetch with a single
-    # request), as opposed to fatcat API fetches
-    ref: BiblioRef
-    csl: Optional[Dict[str, Any]]
-    access: List[AccessOption]
-
-    class Config:
-        arbitrary_types_allowed = True
 
-class FatcatBiblioRef(BaseModel):
+class EnrichedBiblioRef(BaseModel):
     # enriched version of BiblioRef with complete ReleaseEntity object as
     # fetched from the fatcat API. CSL-JSON metadata would be derived from
     # the full release entity.
     ref: BiblioRef
     release: Optional[ReleaseEntity]
     # TODO: openlibrary work?
-    #csl: Optional[Dict[str, Any]]
     access: List[AccessOption]
 
     class Config:
         arbitrary_types_allowed = True
 
+
 class RefHits(BaseModel):
     count_returned: int
     count_total: int
@@ -124,9 +97,13 @@ class RefHits(BaseModel):
     limit: int
     query_time_ms: int
     query_wall_time_ms: int
-    result_refs: List[BiblioRef]
+    result_refs: List[Union[BiblioRef,EnrichedBiblioRef]]
+
 
-def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> List[BiblioRef]:
+def _execute_ref_query(search: Any, limit: int, offset: Optional[int] = None) -> RefHits:
+    """
+    Internal helper for querying elasticsearch refs index and transforming hits
+    """
 
     limit = min((int(limit or 15), 200))
     if not offset or offset < 0:
@@ -179,7 +156,7 @@ def get_outbound_refs(
     limit: int = 100,
     offset: Optional[int] = None,
     es_index: str = "fatcat_ref",
-) -> List[BiblioRef]:
+) -> RefHits:
 
     search = Search(using=es_client, index=es_index)
 
@@ -199,6 +176,7 @@ def get_outbound_refs(
     hits.result_refs = sorted(hits.result_refs, key=lambda r: r.ref_index or 0)
     return hits
 
+
 def get_inbound_refs(
     es_client: Any,
     release_ident: Optional[str] = None,
@@ -208,12 +186,11 @@ def get_inbound_refs(
     url: Optional[str] = None,
     consolidate_works: bool = True,
     filter_stage: List[str] = [],
-    filter_type: List[str] = [],
+    sort: Optional[str] = None,
     limit: int = 25,
     offset: Optional[int] = None,
     es_index: str = "fatcat_ref",
 ) -> List[BiblioRef]:
-    # TODO: filter_stage, filter_type
 
     if url and not url_surt:
         url = surt_ify(url)
@@ -239,10 +216,19 @@ def get_inbound_refs(
     else:
         raise ValueError("require a lookup key")
 
-    search = search.sort("-source_year")
+    if filter_stage:
+        search = search.filter("term", source_stage=filter_stage)
+
+    if sort == "newest":
+        search = search.sort("-source_year")
+    elif sort == "oldest":
+        search = search.sort("source_year")
+    else:
+        search = search.sort("-source_year")
 
     return _execute_ref_query(search, limit=limit, offset=offset)
 
+
 def count_inbound_refs(
     es_client: Any,
     release_ident: Optional[str] = None,
@@ -251,7 +237,6 @@ def count_inbound_refs(
     url_surt: Optional[str] = None,
     url: Optional[str] = None,
     filter_stage: List[str] = [],
-    filter_type: List[str] = [],
     es_index: str = "fatcat_ref",
 ) -> int:
     """
@@ -274,28 +259,26 @@ def count_inbound_refs(
     else:
         raise ValueError("require a lookup key")
 
+    if filter_stage:
+        search = search.filter("term", source_stage=filter_stage)
+
     return search.count()
 
-# run elasticsearch mget query for all ref idents and include "enriched" refs when possible
-# for outbound URL refs, would do wayback CDX fetches to find a direct wayback URL
-# TODO: for openlibrary, would this query openlibrary.org API? or some fatcat-specific index?
-#enrich_inbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
-#enrich_outbound_refs(refs: List[BiblioRef]) -> List[CslBiblioRef]
 
 # run fatcat API fetches for each ref and return "enriched" refs
-def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+def enrich_inbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         if ref.source_release_ident:
             release = fatcat_api_client.get_release(ref.source_release_ident, hide=hide, expand=expand)
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
                 #csl=None,
                 access=release_access_options(release),
                 release=release,
             ))
         else:
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
                 #csl=None,
                 access=[],
@@ -303,21 +286,20 @@ def enrich_inbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hi
             ))
     return enriched
 
-def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[FatcatBiblioRef]:
+
+def enrich_outbound_refs(refs: List[BiblioRef], fatcat_api_client: Any, hide: Optional[str] = "refs", expand: Optional[str] = "container,files,webcaptures,filesets") -> List[EnrichedBiblioRef]:
     enriched = []
     for ref in refs:
         if ref.target_release_ident:
             release = fatcat_api_client.get_release(ref.target_release_ident, hide=hide, expand=expand)
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
-                #csl=None,
                 access=release_access_options(release),
                 release=release,
             ))
         else:
-            enriched.append(FatcatBiblioRef(
+            enriched.append(EnrichedBiblioRef(
                 ref=ref,
-                #csl=None,
                 access=[],
                 release=None,
             ))
@@ -325,6 +307,9 @@ def enrich_outbound_refs_fatcat(refs: List[BiblioRef], fatcat_api_client: Any, h
 
 
 def run_ref_query(args) -> None:
+    """
+    CLI helper/debug tool (prints to stdout)
+    """
     release_ident = None
     work_ident = None
     if args.ident.startswith("release_"):
--
cgit v1.2.3
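
For context on how the renamed helpers and new parameters in this patch fit together, here is a minimal usage sketch (not part of the commit). The elasticsearch host, the fatcat API client setup (the usual OpenAPI-generated client pattern), and the example release ident are placeholders; only get_inbound_refs(), count_inbound_refs(), enrich_inbound_refs(), and the RefHits/EnrichedBiblioRef models come from the module being patched.

    # Illustrative sketch only; hostnames and the release ident are assumptions,
    # not taken from the patch.
    import elasticsearch
    import fatcat_openapi_client

    from fatcat_tools.references import (
        count_inbound_refs,
        enrich_inbound_refs,
        get_inbound_refs,
    )

    # elasticsearch client pointed at the refs index (assumed host)
    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")

    # fatcat API client (standard generated-client setup; host is an assumption)
    conf = fatcat_openapi_client.Configuration()
    conf.host = "https://api.fatcat.wiki/v0"
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))

    RELEASE_IDENT = "aaaaaaaaaaaaaaaaaaaaaaaaaa"  # hypothetical release ident

    # inbound refs, oldest first, restricted to published sources; this exercises
    # the new `sort` and `filter_stage` handling added in this patch. Note that
    # get_inbound_refs() actually returns a RefHits object (via _execute_ref_query),
    # even though its annotation in the patch still reads List[BiblioRef].
    hits = get_inbound_refs(
        es_client,
        release_ident=RELEASE_IDENT,
        sort="oldest",
        filter_stage=["published"],
        limit=25,
    )
    print(f"showing {hits.count_returned} of {hits.count_total} inbound refs")

    # cheap total count without fetching hits
    total = count_inbound_refs(es_client, release_ident=RELEASE_IDENT)
    print(f"total inbound refs: {total}")

    # hydrate BiblioRef hits into EnrichedBiblioRef objects via the fatcat API
    enriched = enrich_inbound_refs(hits.result_refs, fatcat_api_client=api)
    for eref in enriched:
        title = eref.release.title if eref.release else "(no fatcat release)"
        print(title, len(eref.access), "access options")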